1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "SIInstrInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/CodeGen/ScheduleDAG.h" 24 #include "llvm/IR/DiagnosticInfo.h" 25 #include "llvm/IR/Function.h" 26 #include "llvm/CodeGen/RegisterScavenging.h" 27 #include "llvm/MC/MCInstrDesc.h" 28 #include "llvm/Support/Debug.h" 29 30 using namespace llvm; 31 32 // Must be at least 4 to be able to branch over minimum unconditional branch 33 // code. This is only for making it possible to write reasonably small tests for 34 // long branches. 35 static cl::opt<unsigned> 36 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), 37 cl::desc("Restrict range of branch instructions (DEBUG)")); 38 39 SIInstrInfo::SIInstrInfo(const SISubtarget &ST) 40 : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} 41 42 //===----------------------------------------------------------------------===// 43 // TargetInstrInfo callbacks 44 //===----------------------------------------------------------------------===// 45 46 static unsigned getNumOperandsNoGlue(SDNode *Node) { 47 unsigned N = Node->getNumOperands(); 48 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 49 --N; 50 return N; 51 } 52 53 static SDValue findChainOperand(SDNode *Load) { 54 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 55 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 56 return LastOp; 57 } 58 59 /// \brief Returns true if both nodes have the same value for the given 60 /// operand \p Op, or if both nodes do not have this operand. 61 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 62 unsigned Opc0 = N0->getMachineOpcode(); 63 unsigned Opc1 = N1->getMachineOpcode(); 64 65 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 66 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 67 68 if (Op0Idx == -1 && Op1Idx == -1) 69 return true; 70 71 72 if ((Op0Idx == -1 && Op1Idx != -1) || 73 (Op1Idx == -1 && Op0Idx != -1)) 74 return false; 75 76 // getNamedOperandIdx returns the index for the MachineInstr's operands, 77 // which includes the result as the first operand. We are indexing into the 78 // MachineSDNode's operands, so we need to skip the result operand to get 79 // the real index. 80 --Op0Idx; 81 --Op1Idx; 82 83 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 84 } 85 86 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 87 AliasAnalysis *AA) const { 88 // TODO: The generic check fails for VALU instructions that should be 89 // rematerializable due to implicit reads of exec. We really want all of the 90 // generic logic for this except for this. 
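  // Rough intent of the whitelist below (an assumption noted here, not a
  // guarantee): a plain V_MOV of a register or immediate only reads EXEC
  // implicitly, and recomputing it at the use site produces the same value in
  // every active lane, so treating it as trivially rematerializable is safe.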
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
190 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 191 return false; 192 193 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 194 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 195 return true; 196 } 197 198 return false; 199 } 200 201 static bool isStride64(unsigned Opc) { 202 switch (Opc) { 203 case AMDGPU::DS_READ2ST64_B32: 204 case AMDGPU::DS_READ2ST64_B64: 205 case AMDGPU::DS_WRITE2ST64_B32: 206 case AMDGPU::DS_WRITE2ST64_B64: 207 return true; 208 default: 209 return false; 210 } 211 } 212 213 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 214 int64_t &Offset, 215 const TargetRegisterInfo *TRI) const { 216 unsigned Opc = LdSt.getOpcode(); 217 218 if (isDS(LdSt)) { 219 const MachineOperand *OffsetImm = 220 getNamedOperand(LdSt, AMDGPU::OpName::offset); 221 if (OffsetImm) { 222 // Normal, single offset LDS instruction. 223 const MachineOperand *AddrReg = 224 getNamedOperand(LdSt, AMDGPU::OpName::addr); 225 226 BaseReg = AddrReg->getReg(); 227 Offset = OffsetImm->getImm(); 228 return true; 229 } 230 231 // The 2 offset instructions use offset0 and offset1 instead. We can treat 232 // these as a load with a single offset if the 2 offsets are consecutive. We 233 // will use this for some partially aligned loads. 234 const MachineOperand *Offset0Imm = 235 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 236 const MachineOperand *Offset1Imm = 237 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 238 239 uint8_t Offset0 = Offset0Imm->getImm(); 240 uint8_t Offset1 = Offset1Imm->getImm(); 241 242 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 243 // Each of these offsets is in element sized units, so we need to convert 244 // to bytes of the individual reads. 245 246 unsigned EltSize; 247 if (LdSt.mayLoad()) 248 EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; 249 else { 250 assert(LdSt.mayStore()); 251 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 252 EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); 253 } 254 255 if (isStride64(Opc)) 256 EltSize *= 64; 257 258 const MachineOperand *AddrReg = 259 getNamedOperand(LdSt, AMDGPU::OpName::addr); 260 BaseReg = AddrReg->getReg(); 261 Offset = EltSize * Offset0; 262 return true; 263 } 264 265 return false; 266 } 267 268 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 269 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 270 if (SOffset && SOffset->isReg()) 271 return false; 272 273 const MachineOperand *AddrReg = 274 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 275 if (!AddrReg) 276 return false; 277 278 const MachineOperand *OffsetImm = 279 getNamedOperand(LdSt, AMDGPU::OpName::offset); 280 BaseReg = AddrReg->getReg(); 281 Offset = OffsetImm->getImm(); 282 283 if (SOffset) // soffset can be an inline immediate. 
284 Offset += SOffset->getImm(); 285 286 return true; 287 } 288 289 if (isSMRD(LdSt)) { 290 const MachineOperand *OffsetImm = 291 getNamedOperand(LdSt, AMDGPU::OpName::offset); 292 if (!OffsetImm) 293 return false; 294 295 const MachineOperand *SBaseReg = 296 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 297 BaseReg = SBaseReg->getReg(); 298 Offset = OffsetImm->getImm(); 299 return true; 300 } 301 302 if (isFLAT(LdSt)) { 303 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 304 BaseReg = AddrReg->getReg(); 305 Offset = 0; 306 return true; 307 } 308 309 return false; 310 } 311 312 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 313 MachineInstr &SecondLdSt, 314 unsigned NumLoads) const { 315 const MachineOperand *FirstDst = nullptr; 316 const MachineOperand *SecondDst = nullptr; 317 318 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 319 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 320 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 321 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 322 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 323 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 324 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 325 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 326 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 327 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 328 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 329 } 330 331 if (!FirstDst || !SecondDst) 332 return false; 333 334 // Try to limit clustering based on the total number of bytes loaded 335 // rather than the number of instructions. This is done to help reduce 336 // register pressure. The method used is somewhat inexact, though, 337 // because it assumes that all loads in the cluster will load the 338 // same number of bytes as FirstLdSt. 339 340 // The unit of this value is bytes. 341 // FIXME: This needs finer tuning. 
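  // Worked example of the check below: two 4-byte (dword) loads give
  // NumLoads * DstRC->getSize() = 2 * 4 = 8 <= 16 and are clustered, while
  // two 16-byte (4-dword) loads give 32 > 16 and are not.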
342 unsigned LoadClusterThreshold = 16; 343 344 const MachineRegisterInfo &MRI = 345 FirstLdSt.getParent()->getParent()->getRegInfo(); 346 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 347 348 return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; 349 } 350 351 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 352 MachineBasicBlock::iterator MI, 353 const DebugLoc &DL, unsigned DestReg, 354 unsigned SrcReg, bool KillSrc) { 355 MachineFunction *MF = MBB.getParent(); 356 DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), 357 "illegal SGPR to VGPR copy", 358 DL, DS_Error); 359 LLVMContext &C = MF->getFunction()->getContext(); 360 C.diagnose(IllegalCopy); 361 362 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 363 .addReg(SrcReg, getKillRegState(KillSrc)); 364 } 365 366 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 367 MachineBasicBlock::iterator MI, 368 const DebugLoc &DL, unsigned DestReg, 369 unsigned SrcReg, bool KillSrc) const { 370 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 371 372 if (RC == &AMDGPU::VGPR_32RegClass) { 373 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 374 AMDGPU::SReg_32RegClass.contains(SrcReg)); 375 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 376 .addReg(SrcReg, getKillRegState(KillSrc)); 377 return; 378 } 379 380 if (RC == &AMDGPU::SReg_32_XM0RegClass || 381 RC == &AMDGPU::SReg_32RegClass) { 382 if (SrcReg == AMDGPU::SCC) { 383 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 384 .addImm(-1) 385 .addImm(0); 386 return; 387 } 388 389 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 390 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 391 return; 392 } 393 394 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 395 .addReg(SrcReg, getKillRegState(KillSrc)); 396 return; 397 } 398 399 if (RC == &AMDGPU::SReg_64RegClass) { 400 if (DestReg == AMDGPU::VCC) { 401 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 402 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 403 .addReg(SrcReg, getKillRegState(KillSrc)); 404 } else { 405 // FIXME: Hack until VReg_1 removed. 
406 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 407 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 408 .addImm(0) 409 .addReg(SrcReg, getKillRegState(KillSrc)); 410 } 411 412 return; 413 } 414 415 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 416 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 417 return; 418 } 419 420 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 421 .addReg(SrcReg, getKillRegState(KillSrc)); 422 return; 423 } 424 425 if (DestReg == AMDGPU::SCC) { 426 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 427 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 428 .addReg(SrcReg, getKillRegState(KillSrc)) 429 .addImm(0); 430 return; 431 } 432 433 unsigned EltSize = 4; 434 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 435 if (RI.isSGPRClass(RC)) { 436 if (RC->getSize() > 4) { 437 Opcode = AMDGPU::S_MOV_B64; 438 EltSize = 8; 439 } else { 440 Opcode = AMDGPU::S_MOV_B32; 441 EltSize = 4; 442 } 443 444 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 445 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 446 return; 447 } 448 } 449 450 451 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 452 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 453 454 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 455 unsigned SubIdx; 456 if (Forward) 457 SubIdx = SubIndices[Idx]; 458 else 459 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 460 461 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 462 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 463 464 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 465 466 if (Idx == SubIndices.size() - 1) 467 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 468 469 if (Idx == 0) 470 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 471 472 Builder.addReg(SrcReg, RegState::Implicit); 473 } 474 } 475 476 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 477 int NewOpc; 478 479 // Try to map original to commuted opcode 480 NewOpc = AMDGPU::getCommuteRev(Opcode); 481 if (NewOpc != -1) 482 // Check if the commuted (REV) opcode exists on the target. 483 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 484 485 // Try to map commuted to original opcode 486 NewOpc = AMDGPU::getCommuteOrig(Opcode); 487 if (NewOpc != -1) 488 // Check if the original (non-REV) opcode exists on the target. 489 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 490 491 return Opcode; 492 } 493 494 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 495 496 if (DstRC->getSize() == 4) { 497 return RI.isSGPRClass(DstRC) ? 
      AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end up
    // needing them, and need to ensure that the reserved registers are
    // correctly handled.

    if (ST.hasScalarStores()) {
      // m0 is used for offset to scalar stores if used to spill.
587 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 588 } 589 590 return; 591 } 592 593 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 594 LLVMContext &Ctx = MF->getFunction()->getContext(); 595 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 596 " spill register"); 597 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 598 .addReg(SrcReg); 599 600 return; 601 } 602 603 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 604 605 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 606 MFI->setHasSpilledVGPRs(); 607 BuildMI(MBB, MI, DL, get(Opcode)) 608 .addReg(SrcReg, getKillRegState(isKill)) // data 609 .addFrameIndex(FrameIndex) // addr 610 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 611 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 612 .addImm(0) // offset 613 .addMemOperand(MMO); 614 } 615 616 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 617 switch (Size) { 618 case 4: 619 return AMDGPU::SI_SPILL_S32_RESTORE; 620 case 8: 621 return AMDGPU::SI_SPILL_S64_RESTORE; 622 case 16: 623 return AMDGPU::SI_SPILL_S128_RESTORE; 624 case 32: 625 return AMDGPU::SI_SPILL_S256_RESTORE; 626 case 64: 627 return AMDGPU::SI_SPILL_S512_RESTORE; 628 default: 629 llvm_unreachable("unknown register size"); 630 } 631 } 632 633 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 634 switch (Size) { 635 case 4: 636 return AMDGPU::SI_SPILL_V32_RESTORE; 637 case 8: 638 return AMDGPU::SI_SPILL_V64_RESTORE; 639 case 12: 640 return AMDGPU::SI_SPILL_V96_RESTORE; 641 case 16: 642 return AMDGPU::SI_SPILL_V128_RESTORE; 643 case 32: 644 return AMDGPU::SI_SPILL_V256_RESTORE; 645 case 64: 646 return AMDGPU::SI_SPILL_V512_RESTORE; 647 default: 648 llvm_unreachable("unknown register size"); 649 } 650 } 651 652 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 653 MachineBasicBlock::iterator MI, 654 unsigned DestReg, int FrameIndex, 655 const TargetRegisterClass *RC, 656 const TargetRegisterInfo *TRI) const { 657 MachineFunction *MF = MBB.getParent(); 658 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 659 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 660 DebugLoc DL = MBB.findDebugLoc(MI); 661 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 662 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 663 664 MachinePointerInfo PtrInfo 665 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 666 667 MachineMemOperand *MMO = MF->getMachineMemOperand( 668 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 669 670 if (RI.isSGPRClass(RC)) { 671 // FIXME: Maybe this should not include a memoperand because it will be 672 // lowered to non-memory instructions. 673 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); 674 if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { 675 MachineRegisterInfo &MRI = MF->getRegInfo(); 676 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 677 } 678 679 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 680 .addFrameIndex(FrameIndex) // addr 681 .addMemOperand(MMO) 682 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 683 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); 684 685 if (ST.hasScalarStores()) { 686 // m0 is used for offset to scalar stores if used to spill. 
687 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 688 } 689 690 return; 691 } 692 693 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 694 LLVMContext &Ctx = MF->getFunction()->getContext(); 695 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 696 " restore register"); 697 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 698 699 return; 700 } 701 702 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 703 704 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 705 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 706 .addFrameIndex(FrameIndex) // vaddr 707 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 708 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 709 .addImm(0) // offset 710 .addMemOperand(MMO); 711 } 712 713 /// \param @Offset Offset in bytes of the FrameIndex being spilled 714 unsigned SIInstrInfo::calculateLDSSpillAddress( 715 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 716 unsigned FrameOffset, unsigned Size) const { 717 MachineFunction *MF = MBB.getParent(); 718 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 719 const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); 720 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 721 DebugLoc DL = MBB.findDebugLoc(MI); 722 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 723 unsigned WavefrontSize = ST.getWavefrontSize(); 724 725 unsigned TIDReg = MFI->getTIDReg(); 726 if (!MFI->hasCalculatedTID()) { 727 MachineBasicBlock &Entry = MBB.getParent()->front(); 728 MachineBasicBlock::iterator Insert = Entry.front(); 729 DebugLoc DL = Insert->getDebugLoc(); 730 731 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 732 *MF); 733 if (TIDReg == AMDGPU::NoRegister) 734 return TIDReg; 735 736 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 737 WorkGroupSize > WavefrontSize) { 738 739 unsigned TIDIGXReg 740 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 741 unsigned TIDIGYReg 742 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 743 unsigned TIDIGZReg 744 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 745 unsigned InputPtrReg = 746 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 747 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 748 if (!Entry.isLiveIn(Reg)) 749 Entry.addLiveIn(Reg); 750 } 751 752 RS->enterBasicBlock(Entry); 753 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
859 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 860 break; 861 } 862 case AMDGPU::V_MOV_B64_PSEUDO: { 863 unsigned Dst = MI.getOperand(0).getReg(); 864 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 865 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 866 867 const MachineOperand &SrcOp = MI.getOperand(1); 868 // FIXME: Will this work for 64-bit floating point immediates? 869 assert(!SrcOp.isFPImm()); 870 if (SrcOp.isImm()) { 871 APInt Imm(64, SrcOp.getImm()); 872 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 873 .addImm(Imm.getLoBits(32).getZExtValue()) 874 .addReg(Dst, RegState::Implicit | RegState::Define); 875 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 876 .addImm(Imm.getHiBits(32).getZExtValue()) 877 .addReg(Dst, RegState::Implicit | RegState::Define); 878 } else { 879 assert(SrcOp.isReg()); 880 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 881 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 882 .addReg(Dst, RegState::Implicit | RegState::Define); 883 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 884 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 885 .addReg(Dst, RegState::Implicit | RegState::Define); 886 } 887 MI.eraseFromParent(); 888 break; 889 } 890 case AMDGPU::V_MOVRELD_B32_V1: 891 case AMDGPU::V_MOVRELD_B32_V2: 892 case AMDGPU::V_MOVRELD_B32_V4: 893 case AMDGPU::V_MOVRELD_B32_V8: 894 case AMDGPU::V_MOVRELD_B32_V16: { 895 const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); 896 unsigned VecReg = MI.getOperand(0).getReg(); 897 bool IsUndef = MI.getOperand(1).isUndef(); 898 unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); 899 assert(VecReg == MI.getOperand(1).getReg()); 900 901 MachineInstr *MovRel = 902 BuildMI(MBB, MI, DL, MovRelDesc) 903 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 904 .add(MI.getOperand(2)) 905 .addReg(VecReg, RegState::ImplicitDefine) 906 .addReg(VecReg, 907 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 908 909 const int ImpDefIdx = 910 MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); 911 const int ImpUseIdx = ImpDefIdx + 1; 912 MovRel->tieOperands(ImpDefIdx, ImpUseIdx); 913 914 MI.eraseFromParent(); 915 break; 916 } 917 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 918 MachineFunction &MF = *MBB.getParent(); 919 unsigned Reg = MI.getOperand(0).getReg(); 920 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 921 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 922 923 // Create a bundle so these instructions won't be re-ordered by the 924 // post-RA scheduler. 925 MIBundleBuilder Bundler(MBB, MI); 926 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 927 928 // Add 32-bit offset from this instruction to the start of the 929 // constant data. 
930 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 931 .addReg(RegLo) 932 .add(MI.getOperand(1))); 933 934 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 935 .addReg(RegHi); 936 if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) 937 MIB.addImm(0); 938 else 939 MIB.add(MI.getOperand(2)); 940 941 Bundler.append(MIB); 942 llvm::finalizeBundle(MBB, Bundler.begin()); 943 944 MI.eraseFromParent(); 945 break; 946 } 947 } 948 return true; 949 } 950 951 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 952 MachineOperand &Src0, 953 unsigned Src0OpName, 954 MachineOperand &Src1, 955 unsigned Src1OpName) const { 956 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 957 if (!Src0Mods) 958 return false; 959 960 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 961 assert(Src1Mods && 962 "All commutable instructions have both src0 and src1 modifiers"); 963 964 int Src0ModsVal = Src0Mods->getImm(); 965 int Src1ModsVal = Src1Mods->getImm(); 966 967 Src1Mods->setImm(Src0ModsVal); 968 Src0Mods->setImm(Src1ModsVal); 969 return true; 970 } 971 972 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 973 MachineOperand &RegOp, 974 MachineOperand &NonRegOp) { 975 unsigned Reg = RegOp.getReg(); 976 unsigned SubReg = RegOp.getSubReg(); 977 bool IsKill = RegOp.isKill(); 978 bool IsDead = RegOp.isDead(); 979 bool IsUndef = RegOp.isUndef(); 980 bool IsDebug = RegOp.isDebug(); 981 982 if (NonRegOp.isImm()) 983 RegOp.ChangeToImmediate(NonRegOp.getImm()); 984 else if (NonRegOp.isFI()) 985 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 986 else 987 return nullptr; 988 989 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 990 NonRegOp.setSubReg(SubReg); 991 992 return &MI; 993 } 994 995 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 996 unsigned Src0Idx, 997 unsigned Src1Idx) const { 998 assert(!NewMI && "this should never be used"); 999 1000 unsigned Opc = MI.getOpcode(); 1001 int CommutedOpcode = commuteOpcode(Opc); 1002 if (CommutedOpcode == -1) 1003 return nullptr; 1004 1005 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1006 static_cast<int>(Src0Idx) && 1007 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1008 static_cast<int>(Src1Idx) && 1009 "inconsistency with findCommutedOpIndices"); 1010 1011 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1012 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1013 1014 MachineInstr *CommutedMI = nullptr; 1015 if (Src0.isReg() && Src1.isReg()) { 1016 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1017 // Be sure to copy the source modifiers to the right place. 1018 CommutedMI 1019 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1020 } 1021 1022 } else if (Src0.isReg() && !Src1.isReg()) { 1023 // src0 should always be able to support any operand type, so no need to 1024 // check operand legality. 1025 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1026 } else if (!Src0.isReg() && Src1.isReg()) { 1027 if (isOperandLegal(MI, Src1Idx, &Src0)) 1028 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1029 } else { 1030 // FIXME: Found two non registers to commute. This does happen. 
1031 return nullptr; 1032 } 1033 1034 1035 if (CommutedMI) { 1036 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1037 Src1, AMDGPU::OpName::src1_modifiers); 1038 1039 CommutedMI->setDesc(get(CommutedOpcode)); 1040 } 1041 1042 return CommutedMI; 1043 } 1044 1045 // This needs to be implemented because the source modifiers may be inserted 1046 // between the true commutable operands, and the base 1047 // TargetInstrInfo::commuteInstruction uses it. 1048 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, 1049 unsigned &SrcOpIdx1) const { 1050 if (!MI.isCommutable()) 1051 return false; 1052 1053 unsigned Opc = MI.getOpcode(); 1054 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1055 if (Src0Idx == -1) 1056 return false; 1057 1058 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1059 if (Src1Idx == -1) 1060 return false; 1061 1062 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1063 } 1064 1065 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1066 int64_t BrOffset) const { 1067 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1068 // block is unanalyzable. 1069 assert(BranchOp != AMDGPU::S_SETPC_B64); 1070 1071 // Convert to dwords. 1072 BrOffset /= 4; 1073 1074 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1075 // from the next instruction. 1076 BrOffset -= 1; 1077 1078 return isIntN(BranchOffsetBits, BrOffset); 1079 } 1080 1081 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1082 const MachineInstr &MI) const { 1083 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1084 // This would be a difficult analysis to perform, but can always be legal so 1085 // there's no need to analyze it. 1086 return nullptr; 1087 } 1088 1089 return MI.getOperand(0).getMBB(); 1090 } 1091 1092 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1093 MachineBasicBlock &DestBB, 1094 const DebugLoc &DL, 1095 int64_t BrOffset, 1096 RegScavenger *RS) const { 1097 assert(RS && "RegScavenger required for long branching"); 1098 assert(MBB.empty() && 1099 "new block should be inserted for expanding unconditional branch"); 1100 assert(MBB.pred_size() == 1); 1101 1102 MachineFunction *MF = MBB.getParent(); 1103 MachineRegisterInfo &MRI = MF->getRegInfo(); 1104 1105 // FIXME: Virtual register workaround for RegScavenger not working with empty 1106 // blocks. 1107 unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1108 1109 auto I = MBB.end(); 1110 1111 // We need to compute the offset relative to the instruction immediately after 1112 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1113 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1114 1115 // TODO: Handle > 32-bit block address. 1116 if (BrOffset >= 0) { 1117 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1118 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1119 .addReg(PCReg, 0, AMDGPU::sub0) 1120 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); 1121 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1122 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1123 .addReg(PCReg, 0, AMDGPU::sub1) 1124 .addImm(0); 1125 } else { 1126 // Backwards branch. 
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  //   bar;
  //   s_branch dest_bb
  //
  // restore_bb:
  //   restore s[8:9]
  //   fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  return 4 + 8 + 4 + 4;
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
  if (Pred == INVALID_BR)
    return true;

  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
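  // The condition is returned as two operands: Cond[0] carries the
  // BranchPredicate enum value and Cond[1] the SCC/VCC/EXEC register operand
  // of the branch. insertBranch and reverseBranchCondition rely on this
  // layout.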
  Cond.push_back(MachineOperand::CreateImm(Pred));
  Cond.push_back(I->getOperand(1)); // Save the branch register.

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
                                 const MachineOperand &OrigCond) {
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}

unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {

  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = 4;
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
1359 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1360 1361 if (BytesAdded) 1362 *BytesAdded = 4; 1363 return 1; 1364 } 1365 1366 assert(TBB && FBB); 1367 1368 MachineInstr *CondBr = 1369 BuildMI(&MBB, DL, get(Opcode)) 1370 .addMBB(TBB); 1371 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1372 .addMBB(FBB); 1373 1374 MachineOperand &CondReg = CondBr->getOperand(1); 1375 CondReg.setIsUndef(Cond[1].isUndef()); 1376 CondReg.setIsKill(Cond[1].isKill()); 1377 1378 if (BytesAdded) 1379 *BytesAdded = 8; 1380 1381 return 2; 1382 } 1383 1384 bool SIInstrInfo::reverseBranchCondition( 1385 SmallVectorImpl<MachineOperand> &Cond) const { 1386 assert(Cond.size() == 2); 1387 Cond[0].setImm(-Cond[0].getImm()); 1388 return false; 1389 } 1390 1391 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1392 ArrayRef<MachineOperand> Cond, 1393 unsigned TrueReg, unsigned FalseReg, 1394 int &CondCycles, 1395 int &TrueCycles, int &FalseCycles) const { 1396 switch (Cond[0].getImm()) { 1397 case VCCNZ: 1398 case VCCZ: { 1399 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1400 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1401 assert(MRI.getRegClass(FalseReg) == RC); 1402 1403 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1404 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1405 1406 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1407 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1408 } 1409 case SCC_TRUE: 1410 case SCC_FALSE: { 1411 // FIXME: We could insert for VGPRs if we could replace the original compare 1412 // with a vector one. 1413 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1414 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1415 assert(MRI.getRegClass(FalseReg) == RC); 1416 1417 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1418 1419 // Multiples of 8 can do s_cselect_b64 1420 if (NumInsts % 2 == 0) 1421 NumInsts /= 2; 1422 1423 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1424 return RI.isSGPRClass(RC); 1425 } 1426 default: 1427 return false; 1428 } 1429 } 1430 1431 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 1432 MachineBasicBlock::iterator I, const DebugLoc &DL, 1433 unsigned DstReg, ArrayRef<MachineOperand> Cond, 1434 unsigned TrueReg, unsigned FalseReg) const { 1435 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 1436 if (Pred == VCCZ || Pred == SCC_FALSE) { 1437 Pred = static_cast<BranchPredicate>(-Pred); 1438 std::swap(TrueReg, FalseReg); 1439 } 1440 1441 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1442 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 1443 unsigned DstSize = DstRC->getSize(); 1444 1445 if (DstSize == 4) { 1446 unsigned SelOp = Pred == SCC_TRUE ? 1447 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 1448 1449 // Instruction's operands are backwards from what is expected. 
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
      .addReg(FalseReg)
      .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 8 && Pred == SCC_TRUE) {
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
      .addReg(FalseReg)
      .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 4;

  // 64-bit select is only available for SALU.
  if (Pred == SCC_TRUE) {
    SelOp = AMDGPU::S_CSELECT_B64;
    EltRC = &AMDGPU::SGPR_64RegClass;
    SubIndices = Sub0_15_64;

    assert(NElts % 2 == 0);
    NElts /= 2;
  }

  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  I = MIB->getIterator();

  SmallVector<unsigned, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
      .addReg(FalseReg, 0, SubIdx)
      .addReg(TrueReg, 0, SubIdx);
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}

bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    switch (DefMI.getOpcode()) {
    default:
      return false;
    case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
      return false;

    case AMDGPU::V_MOV_B32_e32:
    case AMDGPU::S_MOV_B32:
      break;
    }
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    assert(ImmOp);
    // FIXME: We could handle FrameIndex values here.
    if (!ImmOp->isImm()) {
      return false;
    }
    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    // Don't fold if we are using source or output modifiers. The new VOP2
    // instructions don't have them.
    if (hasAnyModifiersSet(UseMI))
      return false;

    const MachineOperand &ImmOp = DefMI.getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

    // Any src operand can be used for the legality check.
    if (isInlineConstant(UseMI, *Src0, ImmOp))
      return false;

    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.
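      // Sketch of the rewrite performed below (madmk case, illustrative only),
      // where K is the immediate defined by DefMI:
      //   v_mad_f32 d, r, v1, v2   (with r == K)
      //     -->  v_madmk_f32 d, v1, K, v2   // d = v1 * K + v2
      // i.e. src1's register moves into src0 and src1 becomes the literal K.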

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangeToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                               MachineInstr &MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {

    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
      // FIXME: Handle ds_read2 / ds_write2.
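      // A merged access (e.g. ds_read2 / ds_write2) may carry more than one
      // memoperand, in which case the single width used below would not cover
      // the whole access, so be conservative here.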
      return false;
    }
    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
                                                  MachineInstr &MIb,
                                                  AliasAnalysis *AA) const {
  assert((MIa.mayLoad() || MIa.mayStore()) &&
         "MIa must load from or modify a memory location");
  assert((MIb.mayLoad() || MIb.mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
    if (MMOa->getValue() && MMOb->getValue()) {
      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
      if (!AA->alias(LocA, LocB))
        return true;
    }
  }

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  bool IsF16 = false;

  switch (MI.getOpcode()) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
    IsF16 = true;
  case AMDGPU::V_MAC_F32_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
    IsF16 = true;
  case AMDGPU::V_MAC_F32_e32: {
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI,
AMDGPU::OpName::src2); 1826 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 1827 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 1828 1829 return BuildMI(*MBB, MI, MI.getDebugLoc(), 1830 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) 1831 .add(*Dst) 1832 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 1833 .add(*Src0) 1834 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 1835 .add(*Src1) 1836 .addImm(0) // Src mods 1837 .add(*Src2) 1838 .addImm(Clamp ? Clamp->getImm() : 0) 1839 .addImm(Omod ? Omod->getImm() : 0); 1840 } 1841 1842 // It's not generally safe to move VALU instructions across these since it will 1843 // start using the register as a base index rather than directly. 1844 // XXX - Why isn't hasSideEffects sufficient for these? 1845 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 1846 switch (MI.getOpcode()) { 1847 case AMDGPU::S_SET_GPR_IDX_ON: 1848 case AMDGPU::S_SET_GPR_IDX_MODE: 1849 case AMDGPU::S_SET_GPR_IDX_OFF: 1850 return true; 1851 default: 1852 return false; 1853 } 1854 } 1855 1856 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1857 const MachineBasicBlock *MBB, 1858 const MachineFunction &MF) const { 1859 // XXX - Do we want the SP check in the base implementation? 1860 1861 // Target-independent instructions do not have an implicit-use of EXEC, even 1862 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1863 // boundaries prevents incorrect movements of such instructions. 1864 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 1865 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 1866 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 1867 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 1868 changesVGPRIndexingMode(MI); 1869 } 1870 1871 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1872 switch (Imm.getBitWidth()) { 1873 case 32: 1874 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 1875 ST.hasInv2PiInlineImm()); 1876 case 64: 1877 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 1878 ST.hasInv2PiInlineImm()); 1879 case 16: 1880 return ST.has16BitInsts() && 1881 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 1882 ST.hasInv2PiInlineImm()); 1883 default: 1884 llvm_unreachable("invalid bitwidth"); 1885 } 1886 } 1887 1888 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1889 uint8_t OperandType) const { 1890 if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) 1891 return false; 1892 1893 // MachineOperand provides no way to tell the true operand size, since it only 1894 // records a 64-bit value. We need to know the size to determine if a 32-bit 1895 // floating point immediate bit pattern is legal for an integer immediate. It 1896 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
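  // For example, the bit pattern of 1.0f (0x3f800000) is an inline constant
  // when the operand is 32 bits wide, but the same value is not an inline
  // constant for a 64-bit operand: 64-bit inline floats use the double bit
  // patterns, and 0x3f800000 is far outside the -16..64 integer range.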
1897 1898 int64_t Imm = MO.getImm(); 1899 switch (OperandType) { 1900 case AMDGPU::OPERAND_REG_IMM_INT32: 1901 case AMDGPU::OPERAND_REG_IMM_FP32: 1902 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 1903 case AMDGPU::OPERAND_REG_INLINE_C_FP32: { 1904 int32_t Trunc = static_cast<int32_t>(Imm); 1905 return Trunc == Imm && 1906 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 1907 } 1908 case AMDGPU::OPERAND_REG_IMM_INT64: 1909 case AMDGPU::OPERAND_REG_IMM_FP64: 1910 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 1911 case AMDGPU::OPERAND_REG_INLINE_C_FP64: { 1912 return AMDGPU::isInlinableLiteral64(MO.getImm(), 1913 ST.hasInv2PiInlineImm()); 1914 } 1915 case AMDGPU::OPERAND_REG_IMM_INT16: 1916 case AMDGPU::OPERAND_REG_IMM_FP16: 1917 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 1918 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 1919 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 1920 // A few special case instructions have 16-bit operands on subtargets 1921 // where 16-bit instructions are not legal. 1922 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle 1923 // constants in these cases 1924 int16_t Trunc = static_cast<int16_t>(Imm); 1925 return ST.has16BitInsts() && 1926 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 1927 } 1928 1929 return false; 1930 } 1931 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 1932 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { 1933 uint32_t Trunc = static_cast<uint32_t>(Imm); 1934 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 1935 } 1936 default: 1937 llvm_unreachable("invalid bitwidth"); 1938 } 1939 } 1940 1941 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 1942 const MCOperandInfo &OpInfo) const { 1943 switch (MO.getType()) { 1944 case MachineOperand::MO_Register: 1945 return false; 1946 case MachineOperand::MO_Immediate: 1947 return !isInlineConstant(MO, OpInfo); 1948 case MachineOperand::MO_FrameIndex: 1949 case MachineOperand::MO_MachineBasicBlock: 1950 case MachineOperand::MO_ExternalSymbol: 1951 case MachineOperand::MO_GlobalAddress: 1952 case MachineOperand::MO_MCSymbol: 1953 return true; 1954 default: 1955 llvm_unreachable("unexpected operand type"); 1956 } 1957 } 1958 1959 static bool compareMachineOp(const MachineOperand &Op0, 1960 const MachineOperand &Op1) { 1961 if (Op0.getType() != Op1.getType()) 1962 return false; 1963 1964 switch (Op0.getType()) { 1965 case MachineOperand::MO_Register: 1966 return Op0.getReg() == Op1.getReg(); 1967 case MachineOperand::MO_Immediate: 1968 return Op0.getImm() == Op1.getImm(); 1969 default: 1970 llvm_unreachable("Didn't expect to be comparing these operand types"); 1971 } 1972 } 1973 1974 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 1975 const MachineOperand &MO) const { 1976 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 1977 1978 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1979 1980 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1981 return true; 1982 1983 if (OpInfo.RegClass < 0) 1984 return false; 1985 1986 if (MO.isImm() && isInlineConstant(MO, OpInfo)) 1987 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1988 1989 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1990 } 1991 1992 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1993 int Op32 = AMDGPU::getVOPe32(Opcode); 1994 if (Op32 == -1) 1995 return false; 1996 1997 return pseudoToMCOpcode(Op32) != -1; 1998 } 1999 2000 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 2001 // The src0_modifier operand 
is present on all instructions 2002 // that have modifiers. 2003 2004 return AMDGPU::getNamedOperandIdx(Opcode, 2005 AMDGPU::OpName::src0_modifiers) != -1; 2006 } 2007 2008 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 2009 unsigned OpName) const { 2010 const MachineOperand *Mods = getNamedOperand(MI, OpName); 2011 return Mods && Mods->getImm(); 2012 } 2013 2014 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 2015 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 2016 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 2017 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 2018 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 2019 hasModifiersSet(MI, AMDGPU::OpName::omod); 2020 } 2021 2022 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 2023 const MachineOperand &MO, 2024 const MCOperandInfo &OpInfo) const { 2025 // Literal constants use the constant bus. 2026 //if (isLiteralConstantLike(MO, OpInfo)) 2027 // return true; 2028 if (MO.isImm()) 2029 return !isInlineConstant(MO, OpInfo); 2030 2031 if (!MO.isReg()) 2032 return true; // Misc other operands like FrameIndex 2033 2034 if (!MO.isUse()) 2035 return false; 2036 2037 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 2038 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 2039 2040 // FLAT_SCR is just an SGPR pair. 2041 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 2042 return true; 2043 2044 // EXEC register uses the constant bus. 2045 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 2046 return true; 2047 2048 // SGPRs use the constant bus 2049 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 2050 (!MO.isImplicit() && 2051 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 2052 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 2053 } 2054 2055 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 2056 for (const MachineOperand &MO : MI.implicit_operands()) { 2057 // We only care about reads. 
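    // (For instance, v_addc_u32_e32 / v_subb_u32_e32 implicitly read VCC as
    // the carry-in; that implicit SGPR read occupies the single constant bus
    // slot, which is why callers count it.)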
2058 if (MO.isDef()) 2059 continue; 2060 2061 switch (MO.getReg()) { 2062 case AMDGPU::VCC: 2063 case AMDGPU::M0: 2064 case AMDGPU::FLAT_SCR: 2065 return MO.getReg(); 2066 2067 default: 2068 break; 2069 } 2070 } 2071 2072 return AMDGPU::NoRegister; 2073 } 2074 2075 static bool shouldReadExec(const MachineInstr &MI) { 2076 if (SIInstrInfo::isVALU(MI)) { 2077 switch (MI.getOpcode()) { 2078 case AMDGPU::V_READLANE_B32: 2079 case AMDGPU::V_READLANE_B32_si: 2080 case AMDGPU::V_READLANE_B32_vi: 2081 case AMDGPU::V_WRITELANE_B32: 2082 case AMDGPU::V_WRITELANE_B32_si: 2083 case AMDGPU::V_WRITELANE_B32_vi: 2084 return false; 2085 } 2086 2087 return true; 2088 } 2089 2090 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 2091 SIInstrInfo::isSALU(MI) || 2092 SIInstrInfo::isSMRD(MI)) 2093 return false; 2094 2095 return true; 2096 } 2097 2098 static bool isSubRegOf(const SIRegisterInfo &TRI, 2099 const MachineOperand &SuperVec, 2100 const MachineOperand &SubReg) { 2101 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 2102 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 2103 2104 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 2105 SubReg.getReg() == SuperVec.getReg(); 2106 } 2107 2108 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 2109 StringRef &ErrInfo) const { 2110 uint16_t Opcode = MI.getOpcode(); 2111 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2112 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 2113 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 2114 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 2115 2116 // Make sure the number of operands is correct. 2117 const MCInstrDesc &Desc = get(Opcode); 2118 if (!Desc.isVariadic() && 2119 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 2120 ErrInfo = "Instruction has wrong number of operands."; 2121 return false; 2122 } 2123 2124 if (MI.isInlineAsm()) { 2125 // Verify register classes for inlineasm constraints. 2126 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 2127 I != E; ++I) { 2128 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 2129 if (!RC) 2130 continue; 2131 2132 const MachineOperand &Op = MI.getOperand(I); 2133 if (!Op.isReg()) 2134 continue; 2135 2136 unsigned Reg = Op.getReg(); 2137 if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { 2138 ErrInfo = "inlineasm operand has incorrect register class."; 2139 return false; 2140 } 2141 } 2142 2143 return true; 2144 } 2145 2146 // Make sure the register classes are correct. 2147 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 2148 if (MI.getOperand(i).isFPImm()) { 2149 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 2150 "all fp values to integers."; 2151 return false; 2152 } 2153 2154 int RegClass = Desc.OpInfo[i].RegClass; 2155 2156 switch (Desc.OpInfo[i].OperandType) { 2157 case MCOI::OPERAND_REGISTER: 2158 if (MI.getOperand(i).isImm()) { 2159 ErrInfo = "Illegal immediate value for operand."; 2160 return false; 2161 } 2162 break; 2163 case AMDGPU::OPERAND_REG_IMM_INT32: 2164 case AMDGPU::OPERAND_REG_IMM_FP32: 2165 break; 2166 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2167 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2168 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2169 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2170 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2171 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2172 const MachineOperand &MO = MI.getOperand(i); 2173 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 2174 ErrInfo = "Illegal immediate value for operand."; 2175 return false; 2176 } 2177 break; 2178 } 2179 case MCOI::OPERAND_IMMEDIATE: 2180 case AMDGPU::OPERAND_KIMM32: 2181 // Check if this operand is an immediate. 2182 // FrameIndex operands will be replaced by immediates, so they are 2183 // allowed. 2184 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 2185 ErrInfo = "Expected immediate, but got non-immediate"; 2186 return false; 2187 } 2188 LLVM_FALLTHROUGH; 2189 default: 2190 continue; 2191 } 2192 2193 if (!MI.getOperand(i).isReg()) 2194 continue; 2195 2196 if (RegClass != -1) { 2197 unsigned Reg = MI.getOperand(i).getReg(); 2198 if (Reg == AMDGPU::NoRegister || 2199 TargetRegisterInfo::isVirtualRegister(Reg)) 2200 continue; 2201 2202 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 2203 if (!RC->contains(Reg)) { 2204 ErrInfo = "Operand has incorrect register class."; 2205 return false; 2206 } 2207 } 2208 } 2209 2210 // Verify VOP* 2211 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { 2212 // Only look at the true operands. Only a real operand can use the constant 2213 // bus, and we don't want to check pseudo-operands like the source modifier 2214 // flags. 2215 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 2216 2217 unsigned ConstantBusCount = 0; 2218 2219 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 2220 ++ConstantBusCount; 2221 2222 unsigned SGPRUsed = findImplicitSGPRRead(MI); 2223 if (SGPRUsed != AMDGPU::NoRegister) 2224 ++ConstantBusCount; 2225 2226 for (int OpIdx : OpIndices) { 2227 if (OpIdx == -1) 2228 break; 2229 const MachineOperand &MO = MI.getOperand(OpIdx); 2230 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 2231 if (MO.isReg()) { 2232 if (MO.getReg() != SGPRUsed) 2233 ++ConstantBusCount; 2234 SGPRUsed = MO.getReg(); 2235 } else { 2236 ++ConstantBusCount; 2237 } 2238 } 2239 } 2240 if (ConstantBusCount > 1) { 2241 ErrInfo = "VOP* instruction uses the constant bus more than once"; 2242 return false; 2243 } 2244 } 2245 2246 // Verify misc. restrictions on specific instructions. 
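  // v_div_scale_{f32|f64} encodes which value is being scaled by repeating
  // one of the other sources in src0, so src0 must match either src1 or
  // src2; the check below enforces that constraint.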
2247 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 2248 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 2249 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2250 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 2251 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 2252 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 2253 if (!compareMachineOp(Src0, Src1) && 2254 !compareMachineOp(Src0, Src2)) { 2255 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 2256 return false; 2257 } 2258 } 2259 } 2260 2261 if (isSOPK(MI)) { 2262 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 2263 if (sopkIsZext(MI)) { 2264 if (!isUInt<16>(Imm)) { 2265 ErrInfo = "invalid immediate for SOPK instruction"; 2266 return false; 2267 } 2268 } else { 2269 if (!isInt<16>(Imm)) { 2270 ErrInfo = "invalid immediate for SOPK instruction"; 2271 return false; 2272 } 2273 } 2274 } 2275 2276 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 2277 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 2278 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2279 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 2280 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2281 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 2282 2283 const unsigned StaticNumOps = Desc.getNumOperands() + 2284 Desc.getNumImplicitUses(); 2285 const unsigned NumImplicitOps = IsDst ? 2 : 1; 2286 2287 // Allow additional implicit operands. This allows a fixup done by the post 2288 // RA scheduler where the main implicit operand is killed and implicit-defs 2289 // are added for sub-registers that remain live after this instruction. 2290 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 2291 ErrInfo = "missing implicit register operands"; 2292 return false; 2293 } 2294 2295 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2296 if (IsDst) { 2297 if (!Dst->isUse()) { 2298 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 2299 return false; 2300 } 2301 2302 unsigned UseOpIdx; 2303 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 2304 UseOpIdx != StaticNumOps + 1) { 2305 ErrInfo = "movrel implicit operands should be tied"; 2306 return false; 2307 } 2308 } 2309 2310 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2311 const MachineOperand &ImpUse 2312 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 2313 if (!ImpUse.isReg() || !ImpUse.isUse() || 2314 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 2315 ErrInfo = "src0 should be subreg of implicit vector use"; 2316 return false; 2317 } 2318 } 2319 2320 // Make sure we aren't losing exec uses in the td files. This mostly requires 2321 // being careful when using let Uses to try to add other use registers. 2322 if (shouldReadExec(MI)) { 2323 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 2324 ErrInfo = "VALU instruction does not implicitly read exec mask"; 2325 return false; 2326 } 2327 } 2328 2329 if (isSMRD(MI)) { 2330 if (MI.mayStore()) { 2331 // The register offset form of scalar stores may only use m0 as the 2332 // soffset register. 
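      // For instance, an s_store_dword whose soffset operand is any SGPR
      // other than m0 is flagged here.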
2333 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 2334 if (Soff && Soff->getReg() != AMDGPU::M0) { 2335 ErrInfo = "scalar stores must use m0 as offset register"; 2336 return false; 2337 } 2338 } 2339 } 2340 2341 return true; 2342 } 2343 2344 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 2345 switch (MI.getOpcode()) { 2346 default: return AMDGPU::INSTRUCTION_LIST_END; 2347 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 2348 case AMDGPU::COPY: return AMDGPU::COPY; 2349 case AMDGPU::PHI: return AMDGPU::PHI; 2350 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 2351 case AMDGPU::S_MOV_B32: 2352 return MI.getOperand(1).isReg() ? 2353 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 2354 case AMDGPU::S_ADD_I32: 2355 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 2356 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 2357 case AMDGPU::S_SUB_I32: 2358 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 2359 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 2360 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 2361 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 2362 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 2363 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 2364 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 2365 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 2366 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 2367 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 2368 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 2369 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 2370 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 2371 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 2372 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 2373 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 2374 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 2375 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 2376 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 2377 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 2378 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 2379 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 2380 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 2381 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 2382 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 2383 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 2384 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 2385 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 2386 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 2387 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 2388 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 2389 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 2390 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 2391 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 2392 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 2393 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 2394 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 2395 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 2396 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 2397 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 2398 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 2399 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 2400 case AMDGPU::S_CBRANCH_SCC0: return 
AMDGPU::S_CBRANCH_VCCZ; 2401 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 2402 } 2403 } 2404 2405 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 2406 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 2407 } 2408 2409 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 2410 unsigned OpNo) const { 2411 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2412 const MCInstrDesc &Desc = get(MI.getOpcode()); 2413 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 2414 Desc.OpInfo[OpNo].RegClass == -1) { 2415 unsigned Reg = MI.getOperand(OpNo).getReg(); 2416 2417 if (TargetRegisterInfo::isVirtualRegister(Reg)) 2418 return MRI.getRegClass(Reg); 2419 return RI.getPhysRegClass(Reg); 2420 } 2421 2422 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 2423 return RI.getRegClass(RCID); 2424 } 2425 2426 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 2427 switch (MI.getOpcode()) { 2428 case AMDGPU::COPY: 2429 case AMDGPU::REG_SEQUENCE: 2430 case AMDGPU::PHI: 2431 case AMDGPU::INSERT_SUBREG: 2432 return RI.hasVGPRs(getOpRegClass(MI, 0)); 2433 default: 2434 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 2435 } 2436 } 2437 2438 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 2439 MachineBasicBlock::iterator I = MI; 2440 MachineBasicBlock *MBB = MI.getParent(); 2441 MachineOperand &MO = MI.getOperand(OpIdx); 2442 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2443 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 2444 const TargetRegisterClass *RC = RI.getRegClass(RCID); 2445 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 2446 if (MO.isReg()) 2447 Opcode = AMDGPU::COPY; 2448 else if (RI.isSGPRClass(RC)) 2449 Opcode = AMDGPU::S_MOV_B32; 2450 2451 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 2452 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 2453 VRC = &AMDGPU::VReg_64RegClass; 2454 else 2455 VRC = &AMDGPU::VGPR_32RegClass; 2456 2457 unsigned Reg = MRI.createVirtualRegister(VRC); 2458 DebugLoc DL = MBB->findDebugLoc(I); 2459 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 2460 MO.ChangeToRegister(Reg, false); 2461 } 2462 2463 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 2464 MachineRegisterInfo &MRI, 2465 MachineOperand &SuperReg, 2466 const TargetRegisterClass *SuperRC, 2467 unsigned SubIdx, 2468 const TargetRegisterClass *SubRC) 2469 const { 2470 MachineBasicBlock *MBB = MI->getParent(); 2471 DebugLoc DL = MI->getDebugLoc(); 2472 unsigned SubReg = MRI.createVirtualRegister(SubRC); 2473 2474 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 2475 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2476 .addReg(SuperReg.getReg(), 0, SubIdx); 2477 return SubReg; 2478 } 2479 2480 // Just in case the super register is itself a sub-register, copy it to a new 2481 // value so we don't need to worry about merging its subreg index with the 2482 // SubIdx passed to this function. The register coalescer should be able to 2483 // eliminate this extra copy. 
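  // Illustrative shape of the two copies emitted below, e.g. for an operand
  // %X:sub2_sub3 with SubIdx == sub0:
  //   %NewSuper = COPY %X:sub2_sub3
  //   %Sub      = COPY %NewSuper:sub0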
2484 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 2485 2486 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 2487 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 2488 2489 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2490 .addReg(NewSuperReg, 0, SubIdx); 2491 2492 return SubReg; 2493 } 2494 2495 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 2496 MachineBasicBlock::iterator MII, 2497 MachineRegisterInfo &MRI, 2498 MachineOperand &Op, 2499 const TargetRegisterClass *SuperRC, 2500 unsigned SubIdx, 2501 const TargetRegisterClass *SubRC) const { 2502 if (Op.isImm()) { 2503 if (SubIdx == AMDGPU::sub0) 2504 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 2505 if (SubIdx == AMDGPU::sub1) 2506 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 2507 2508 llvm_unreachable("Unhandled register index for immediate"); 2509 } 2510 2511 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 2512 SubIdx, SubRC); 2513 return MachineOperand::CreateReg(SubReg, false); 2514 } 2515 2516 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 2517 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 2518 assert(Inst.getNumExplicitOperands() == 3); 2519 MachineOperand Op1 = Inst.getOperand(1); 2520 Inst.RemoveOperand(1); 2521 Inst.addOperand(Op1); 2522 } 2523 2524 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 2525 const MCOperandInfo &OpInfo, 2526 const MachineOperand &MO) const { 2527 if (!MO.isReg()) 2528 return false; 2529 2530 unsigned Reg = MO.getReg(); 2531 const TargetRegisterClass *RC = 2532 TargetRegisterInfo::isVirtualRegister(Reg) ? 2533 MRI.getRegClass(Reg) : 2534 RI.getPhysRegClass(Reg); 2535 2536 const SIRegisterInfo *TRI = 2537 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 2538 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 2539 2540 // In order to be legal, the common sub-class must be equal to the 2541 // class of the current operand. For example: 2542 // 2543 // v_mov_b32 s0 ; Operand defined as vsrc_b32 2544 // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL 2545 // 2546 // s_sendmsg 0, s0 ; Operand defined as m0reg 2547 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 2548 2549 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 2550 } 2551 2552 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 2553 const MCOperandInfo &OpInfo, 2554 const MachineOperand &MO) const { 2555 if (MO.isReg()) 2556 return isLegalRegOperand(MRI, OpInfo, MO); 2557 2558 // Handle non-register types that are treated like immediates. 2559 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2560 return true; 2561 } 2562 2563 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 2564 const MachineOperand *MO) const { 2565 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2566 const MCInstrDesc &InstDesc = MI.getDesc(); 2567 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 2568 const TargetRegisterClass *DefinedRC = 2569 OpInfo.RegClass != -1 ? 
RI.getRegClass(OpInfo.RegClass) : nullptr; 2570 if (!MO) 2571 MO = &MI.getOperand(OpIdx); 2572 2573 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 2574 2575 RegSubRegPair SGPRUsed; 2576 if (MO->isReg()) 2577 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 2578 2579 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2580 if (i == OpIdx) 2581 continue; 2582 const MachineOperand &Op = MI.getOperand(i); 2583 if (Op.isReg()) { 2584 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 2585 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 2586 return false; 2587 } 2588 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 2589 return false; 2590 } 2591 } 2592 } 2593 2594 if (MO->isReg()) { 2595 assert(DefinedRC); 2596 return isLegalRegOperand(MRI, OpInfo, *MO); 2597 } 2598 2599 // Handle non-register types that are treated like immediates. 2600 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 2601 2602 if (!DefinedRC) { 2603 // This operand expects an immediate. 2604 return true; 2605 } 2606 2607 return isImmOperandLegal(MI, OpIdx, *MO); 2608 } 2609 2610 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 2611 MachineInstr &MI) const { 2612 unsigned Opc = MI.getOpcode(); 2613 const MCInstrDesc &InstrDesc = get(Opc); 2614 2615 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2616 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2617 2618 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 2619 // we need to only have one constant bus use. 2620 // 2621 // Note we do not need to worry about literal constants here. They are 2622 // disabled for the operand type for instructions because they will always 2623 // violate the one constant bus use rule. 2624 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 2625 if (HasImplicitSGPR) { 2626 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2627 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2628 2629 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 2630 legalizeOpWithMove(MI, Src0Idx); 2631 } 2632 2633 // VOP2 src0 instructions support all operand types, so we don't need to check 2634 // their legality. If src1 is already legal, we don't need to do anything. 2635 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 2636 return; 2637 2638 // We do not use commuteInstruction here because it is too aggressive and will 2639 // commute if it is possible. We only want to commute here if it improves 2640 // legality. This can be called a fairly large number of times so don't waste 2641 // compile time pointlessly swapping and checking legality again. 2642 if (HasImplicitSGPR || !MI.isCommutable()) { 2643 legalizeOpWithMove(MI, Src1Idx); 2644 return; 2645 } 2646 2647 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2648 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2649 2650 // If src0 can be used as src1, commuting will make the operands legal. 2651 // Otherwise we have to give up and insert a move. 2652 // 2653 // TODO: Other immediate-like operand kinds could be commuted if there was a 2654 // MachineOperand::ChangeTo* for them. 
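  // For example, a commutable op such as v_add_i32 with a VGPR in src0 and an
  // SGPR in src1 can simply have the two swapped: the SGPR lands in src0,
  // which accepts it, and the VGPR lands in src1, so no extra v_mov_b32 is
  // needed.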
2655 if ((!Src1.isImm() && !Src1.isReg()) || 2656 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 2657 legalizeOpWithMove(MI, Src1Idx); 2658 return; 2659 } 2660 2661 int CommutedOpc = commuteOpcode(MI); 2662 if (CommutedOpc == -1) { 2663 legalizeOpWithMove(MI, Src1Idx); 2664 return; 2665 } 2666 2667 MI.setDesc(get(CommutedOpc)); 2668 2669 unsigned Src0Reg = Src0.getReg(); 2670 unsigned Src0SubReg = Src0.getSubReg(); 2671 bool Src0Kill = Src0.isKill(); 2672 2673 if (Src1.isImm()) 2674 Src0.ChangeToImmediate(Src1.getImm()); 2675 else if (Src1.isReg()) { 2676 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 2677 Src0.setSubReg(Src1.getSubReg()); 2678 } else 2679 llvm_unreachable("Should only have register or immediate operands"); 2680 2681 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 2682 Src1.setSubReg(Src0SubReg); 2683 } 2684 2685 // Legalize VOP3 operands. Because all operand types are supported for any 2686 // operand, and since literal constants are not allowed and should never be 2687 // seen, we only need to worry about inserting copies if we use multiple SGPR 2688 // operands. 2689 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 2690 MachineInstr &MI) const { 2691 unsigned Opc = MI.getOpcode(); 2692 2693 int VOP3Idx[3] = { 2694 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 2695 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 2696 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 2697 }; 2698 2699 // Find the one SGPR operand we are allowed to use. 2700 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 2701 2702 for (unsigned i = 0; i < 3; ++i) { 2703 int Idx = VOP3Idx[i]; 2704 if (Idx == -1) 2705 break; 2706 MachineOperand &MO = MI.getOperand(Idx); 2707 2708 // We should never see a VOP3 instruction with an illegal immediate operand. 2709 if (!MO.isReg()) 2710 continue; 2711 2712 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2713 continue; // VGPRs are legal 2714 2715 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 2716 SGPRReg = MO.getReg(); 2717 // We can use one SGPR in each VOP3 instruction. 2718 continue; 2719 } 2720 2721 // If we make it this far, then the operand is not legal and we must 2722 // legalize it. 2723 legalizeOpWithMove(MI, Idx); 2724 } 2725 } 2726 2727 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, 2728 MachineRegisterInfo &MRI) const { 2729 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 2730 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 2731 unsigned DstReg = MRI.createVirtualRegister(SRC); 2732 unsigned SubRegs = VRC->getSize() / 4; 2733 2734 SmallVector<unsigned, 8> SRegs; 2735 for (unsigned i = 0; i < SubRegs; ++i) { 2736 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2737 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 2738 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 2739 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 2740 SRegs.push_back(SGPR); 2741 } 2742 2743 MachineInstrBuilder MIB = 2744 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 2745 get(AMDGPU::REG_SEQUENCE), DstReg); 2746 for (unsigned i = 0; i < SubRegs; ++i) { 2747 MIB.addReg(SRegs[i]); 2748 MIB.addImm(RI.getSubRegFromChannel(i)); 2749 } 2750 return DstReg; 2751 } 2752 2753 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 2754 MachineInstr &MI) const { 2755 2756 // If the pointer is store in VGPRs, then we need to move them to 2757 // SGPRs using v_readfirstlane. 
This is safe because we only select 2758 // loads with uniform pointers to SMRD instructions, so we know the 2759 // pointer value is uniform. 2760 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 2761 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 2762 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 2763 SBase->setReg(SGPR); 2764 } 2765 } 2766 2767 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 2768 MachineBasicBlock::iterator I, 2769 const TargetRegisterClass *DstRC, 2770 MachineOperand &Op, 2771 MachineRegisterInfo &MRI, 2772 const DebugLoc &DL) const { 2773 2774 unsigned OpReg = Op.getReg(); 2775 unsigned OpSubReg = Op.getSubReg(); 2776 2777 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 2778 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 2779 2780 // Check if operand is already the correct register class. 2781 if (DstRC == OpRC) 2782 return; 2783 2784 unsigned DstReg = MRI.createVirtualRegister(DstRC); 2785 MachineInstr *Copy = 2786 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 2787 2788 Op.setReg(DstReg); 2789 Op.setSubReg(0); 2790 2791 MachineInstr *Def = MRI.getVRegDef(OpReg); 2792 if (!Def) 2793 return; 2794 2795 // Try to eliminate the copy if it is copying an immediate value. 2796 if (Def->isMoveImmediate()) 2797 FoldImmediate(*Copy, *Def, OpReg, &MRI); 2798 } 2799 2800 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { 2801 MachineFunction &MF = *MI.getParent()->getParent(); 2802 MachineRegisterInfo &MRI = MF.getRegInfo(); 2803 2804 // Legalize VOP2 2805 if (isVOP2(MI) || isVOPC(MI)) { 2806 legalizeOperandsVOP2(MRI, MI); 2807 return; 2808 } 2809 2810 // Legalize VOP3 2811 if (isVOP3(MI)) { 2812 legalizeOperandsVOP3(MRI, MI); 2813 return; 2814 } 2815 2816 // Legalize SMRD 2817 if (isSMRD(MI)) { 2818 legalizeOperandsSMRD(MRI, MI); 2819 return; 2820 } 2821 2822 // Legalize REG_SEQUENCE and PHI 2823 // The register class of the operands must be the same type as the register 2824 // class of the output. 2825 if (MI.getOpcode() == AMDGPU::PHI) { 2826 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 2827 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 2828 if (!MI.getOperand(i).isReg() || 2829 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) 2830 continue; 2831 const TargetRegisterClass *OpRC = 2832 MRI.getRegClass(MI.getOperand(i).getReg()); 2833 if (RI.hasVGPRs(OpRC)) { 2834 VRC = OpRC; 2835 } else { 2836 SRC = OpRC; 2837 } 2838 } 2839 2840 // If any of the operands are VGPR registers, then they must all be VGPRs; 2841 // otherwise we will create illegal VGPR->SGPR copies when legalizing 2842 // them. 2843 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 2844 if (!VRC) { 2845 assert(SRC); 2846 VRC = RI.getEquivalentVGPRClass(SRC); 2847 } 2848 RC = VRC; 2849 } else { 2850 RC = SRC; 2851 } 2852 2853 // Update all the operands so they have the same type. 2854 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2855 MachineOperand &Op = MI.getOperand(I); 2856 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) 2857 continue; 2858 2859 // MI is a PHI instruction. 2860 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 2861 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 2862 2863 // Avoid creating no-op copies with the same src and dst reg class. These 2864 // confuse some of the machine passes.
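      // For example, a PHI joining an SGPR value from one predecessor with a
      // VGPR value from another gets both inputs copied to the common VGPR
      // class, each COPY inserted before its predecessor's terminator.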
2865 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 2866 } 2867 } 2868 2869 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 2870 // VGPR dest type and SGPR sources, insert copies so all operands are 2871 // VGPRs. This seems to help operand folding / the register coalescer. 2872 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 2873 MachineBasicBlock *MBB = MI.getParent(); 2874 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 2875 if (RI.hasVGPRs(DstRC)) { 2876 // Update all the operands so they are VGPR register classes. These may 2877 // not be the same register class because REG_SEQUENCE supports mixing 2878 // subregister index types e.g. sub0_sub1 + sub2 + sub3 2879 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2880 MachineOperand &Op = MI.getOperand(I); 2881 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) 2882 continue; 2883 2884 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 2885 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 2886 if (VRC == OpRC) 2887 continue; 2888 2889 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 2890 Op.setIsKill(); 2891 } 2892 } 2893 2894 return; 2895 } 2896 2897 // Legalize INSERT_SUBREG 2898 // src0 must have the same register class as dst 2899 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 2900 unsigned Dst = MI.getOperand(0).getReg(); 2901 unsigned Src0 = MI.getOperand(1).getReg(); 2902 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 2903 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 2904 if (DstRC != Src0RC) { 2905 MachineBasicBlock *MBB = MI.getParent(); 2906 MachineOperand &Op = MI.getOperand(1); 2907 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 2908 } 2909 return; 2910 } 2911 2912 // Legalize MIMG and MUBUF/MTBUF for shaders. 2913 // 2914 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 2915 // scratch memory access. In both cases, the legalization never involves 2916 // conversion to the addr64 form. 2917 if (isMIMG(MI) || 2918 (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && 2919 (isMUBUF(MI) || isMTBUF(MI)))) { 2920 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 2921 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { 2922 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); 2923 SRsrc->setReg(SGPR); 2924 } 2925 2926 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 2927 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { 2928 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); 2929 SSamp->setReg(SGPR); 2930 } 2931 return; 2932 } 2933 2934 // Legalize MUBUF* instructions by converting to addr64 form. 2935 // FIXME: If we start using the non-addr64 instructions for compute, we 2936 // may need to legalize them as above. This especially applies to the 2937 // buffer_load_format_* variants and variants with idxen (or bothen). 2938 int SRsrcIdx = 2939 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 2940 if (SRsrcIdx != -1) { 2941 // We have an MUBUF instruction 2942 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); 2943 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; 2944 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), 2945 RI.getRegClass(SRsrcRC))) { 2946 // The operands are legal. 2947 // FIXME: We may need to legalize operands besides srsrc.
2948 return; 2949 } 2950 2951 MachineBasicBlock &MBB = *MI.getParent(); 2952 2953 // Extract the ptr from the resource descriptor. 2954 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, 2955 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 2956 2957 // Create an empty resource descriptor 2958 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2959 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2960 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2961 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 2962 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); 2963 2964 // Zero64 = 0 2965 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) 2966 .addImm(0); 2967 2968 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 2969 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 2970 .addImm(RsrcDataFormat & 0xFFFFFFFF); 2971 2972 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 2973 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 2974 .addImm(RsrcDataFormat >> 32); 2975 2976 // NewSRsrc = {Zero64, SRsrcFormat} 2977 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) 2978 .addReg(Zero64) 2979 .addImm(AMDGPU::sub0_sub1) 2980 .addReg(SRsrcFormatLo) 2981 .addImm(AMDGPU::sub2) 2982 .addReg(SRsrcFormatHi) 2983 .addImm(AMDGPU::sub3); 2984 2985 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 2986 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2987 if (VAddr) { 2988 // This is already an ADDR64 instruction so we need to add the pointer 2989 // extracted from the resource descriptor to the current value of VAddr. 2990 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2991 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2992 2993 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 2994 DebugLoc DL = MI.getDebugLoc(); 2995 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) 2996 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 2997 .addReg(VAddr->getReg(), 0, AMDGPU::sub0); 2998 2999 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 3000 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) 3001 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3002 .addReg(VAddr->getReg(), 0, AMDGPU::sub1); 3003 3004 // NewVaddr = {NewVaddrHi, NewVaddrLo} 3005 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 3006 .addReg(NewVAddrLo) 3007 .addImm(AMDGPU::sub0) 3008 .addReg(NewVAddrHi) 3009 .addImm(AMDGPU::sub1); 3010 } else { 3011 // This instruction is the _OFFSET variant, so we need to convert it to 3012 // ADDR64. 3013 assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() 3014 < SISubtarget::VOLCANIC_ISLANDS && 3015 "FIXME: Need to emit flat atomics here"); 3016 3017 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); 3018 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3019 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 3020 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); 3021 3022 // Atomics with return have an additional tied operand and are 3023 // missing some of the special bits. 3024 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); 3025 MachineInstr *Addr64; 3026 3027 if (!VDataIn) { 3028 // Regular buffer load / store.
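        // The _ADDR64 form built here takes vdata, vaddr, srsrc, soffset,
        // offset and then the glc/slc/tfe bits; vaddr gets a placeholder
        // register for now and is patched once the REG_SEQUENCE for the
        // extracted pointer has been emitted below.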
3029 MachineInstrBuilder MIB = 3030 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 3031 .add(*VData) 3032 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 3033 // This will be replaced later 3034 // with the new value of vaddr. 3035 .add(*SRsrc) 3036 .add(*SOffset) 3037 .add(*Offset); 3038 3039 // Atomics do not have this operand. 3040 if (const MachineOperand *GLC = 3041 getNamedOperand(MI, AMDGPU::OpName::glc)) { 3042 MIB.addImm(GLC->getImm()); 3043 } 3044 3045 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 3046 3047 if (const MachineOperand *TFE = 3048 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 3049 MIB.addImm(TFE->getImm()); 3050 } 3051 3052 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3053 Addr64 = MIB; 3054 } else { 3055 // Atomics with return. 3056 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 3057 .add(*VData) 3058 .add(*VDataIn) 3059 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 3060 // This will be replaced later 3061 // with the new value of vaddr. 3062 .add(*SRsrc) 3063 .add(*SOffset) 3064 .add(*Offset) 3065 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 3066 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3067 } 3068 3069 MI.removeFromParent(); 3070 3071 // NewVaddr = {NewVaddrHi, NewVaddrLo} 3072 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 3073 NewVAddr) 3074 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 3075 .addImm(AMDGPU::sub0) 3076 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3077 .addImm(AMDGPU::sub1); 3078 3079 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); 3080 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); 3081 } 3082 3083 // Update the instruction to use NewVaddr 3084 VAddr->setReg(NewVAddr); 3085 // Update the instruction to use NewSRsrc 3086 SRsrc->setReg(NewSRsrc); 3087 } 3088 } 3089 3090 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 3091 SmallVector<MachineInstr *, 128> Worklist; 3092 Worklist.push_back(&TopInst); 3093 3094 while (!Worklist.empty()) { 3095 MachineInstr &Inst = *Worklist.pop_back_val(); 3096 MachineBasicBlock *MBB = Inst.getParent(); 3097 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 3098 3099 unsigned Opcode = Inst.getOpcode(); 3100 unsigned NewOpcode = getVALUOp(Inst); 3101 3102 // Handle some special cases 3103 switch (Opcode) { 3104 default: 3105 break; 3106 case AMDGPU::S_AND_B64: 3107 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 3108 Inst.eraseFromParent(); 3109 continue; 3110 3111 case AMDGPU::S_OR_B64: 3112 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 3113 Inst.eraseFromParent(); 3114 continue; 3115 3116 case AMDGPU::S_XOR_B64: 3117 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 3118 Inst.eraseFromParent(); 3119 continue; 3120 3121 case AMDGPU::S_NOT_B64: 3122 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 3123 Inst.eraseFromParent(); 3124 continue; 3125 3126 case AMDGPU::S_BCNT1_I32_B64: 3127 splitScalar64BitBCNT(Worklist, Inst); 3128 Inst.eraseFromParent(); 3129 continue; 3130 3131 case AMDGPU::S_BFE_I64: { 3132 splitScalar64BitBFE(Worklist, Inst); 3133 Inst.eraseFromParent(); 3134 continue; 3135 } 3136 3137 case AMDGPU::S_LSHL_B32: 3138 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3139 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 3140 swapOperands(Inst); 3141 } 3142 break; 3143 case AMDGPU::S_ASHR_I32: 3144 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3145 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 3146 
swapOperands(Inst); 3147 } 3148 break; 3149 case AMDGPU::S_LSHR_B32: 3150 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3151 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 3152 swapOperands(Inst); 3153 } 3154 break; 3155 case AMDGPU::S_LSHL_B64: 3156 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3157 NewOpcode = AMDGPU::V_LSHLREV_B64; 3158 swapOperands(Inst); 3159 } 3160 break; 3161 case AMDGPU::S_ASHR_I64: 3162 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3163 NewOpcode = AMDGPU::V_ASHRREV_I64; 3164 swapOperands(Inst); 3165 } 3166 break; 3167 case AMDGPU::S_LSHR_B64: 3168 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3169 NewOpcode = AMDGPU::V_LSHRREV_B64; 3170 swapOperands(Inst); 3171 } 3172 break; 3173 3174 case AMDGPU::S_ABS_I32: 3175 lowerScalarAbs(Worklist, Inst); 3176 Inst.eraseFromParent(); 3177 continue; 3178 3179 case AMDGPU::S_CBRANCH_SCC0: 3180 case AMDGPU::S_CBRANCH_SCC1: 3181 // Clear unused bits of vcc 3182 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 3183 AMDGPU::VCC) 3184 .addReg(AMDGPU::EXEC) 3185 .addReg(AMDGPU::VCC); 3186 break; 3187 3188 case AMDGPU::S_BFE_U64: 3189 case AMDGPU::S_BFM_B64: 3190 llvm_unreachable("Moving this op to VALU not implemented"); 3191 3192 case AMDGPU::S_PACK_LL_B32_B16: 3193 case AMDGPU::S_PACK_LH_B32_B16: 3194 case AMDGPU::S_PACK_HH_B32_B16: { 3195 movePackToVALU(Worklist, MRI, Inst); 3196 Inst.eraseFromParent(); 3197 continue; 3198 } 3199 } 3200 3201 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 3202 // We cannot move this instruction to the VALU, so we should try to 3203 // legalize its operands instead. 3204 legalizeOperands(Inst); 3205 continue; 3206 } 3207 3208 // Use the new VALU Opcode. 3209 const MCInstrDesc &NewDesc = get(NewOpcode); 3210 Inst.setDesc(NewDesc); 3211 3212 // Remove any references to SCC. Vector instructions can't read from it, and 3213 // We're just about to add the implicit use / defs of VCC, and we don't want 3214 // both. 3215 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 3216 MachineOperand &Op = Inst.getOperand(i); 3217 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 3218 Inst.RemoveOperand(i); 3219 addSCCDefUsersToVALUWorklist(Inst, Worklist); 3220 } 3221 } 3222 3223 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 3224 // We are converting these to a BFE, so we need to add the missing 3225 // operands for the size and offset. 3226 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 3227 Inst.addOperand(MachineOperand::CreateImm(0)); 3228 Inst.addOperand(MachineOperand::CreateImm(Size)); 3229 3230 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 3231 // The VALU version adds the second operand to the result, so insert an 3232 // extra 0 operand. 3233 Inst.addOperand(MachineOperand::CreateImm(0)); 3234 } 3235 3236 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 3237 3238 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 3239 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 3240 // If we need to move this to VGPRs, we need to unpack the second operand 3241 // back into the 2 separate ones for bit offset and width. 3242 assert(OffsetWidthOp.isImm() && 3243 "Scalar BFE is only implemented for constant width and offset"); 3244 uint32_t Imm = OffsetWidthOp.getImm(); 3245 3246 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 3247 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3248 Inst.RemoveOperand(2); // Remove old immediate. 
3249 Inst.addOperand(MachineOperand::CreateImm(Offset)); 3250 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 3251 } 3252 3253 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 3254 unsigned NewDstReg = AMDGPU::NoRegister; 3255 if (HasDst) { 3256 unsigned DstReg = Inst.getOperand(0).getReg(); 3257 if (TargetRegisterInfo::isPhysicalRegister(DstReg)) 3258 continue; 3259 3260 // Update the destination register class. 3261 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 3262 if (!NewDstRC) 3263 continue; 3264 3265 if (Inst.isCopy() && 3266 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && 3267 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 3268 // Instead of creating a copy where src and dst are the same register 3269 // class, we just replace all uses of dst with src. These kinds of 3270 // copies interfere with the heuristics MachineSink uses to decide 3271 // whether or not to split a critical edge. Since the pass assumes 3272 // that copies will end up as machine instructions and not be 3273 // eliminated. 3274 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 3275 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 3276 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 3277 Inst.getOperand(0).setReg(DstReg); 3278 continue; 3279 } 3280 3281 NewDstReg = MRI.createVirtualRegister(NewDstRC); 3282 MRI.replaceRegWith(DstReg, NewDstReg); 3283 } 3284 3285 // Legalize the operands 3286 legalizeOperands(Inst); 3287 3288 if (HasDst) 3289 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 3290 } 3291 } 3292 3293 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 3294 MachineInstr &Inst) const { 3295 MachineBasicBlock &MBB = *Inst.getParent(); 3296 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3297 MachineBasicBlock::iterator MII = Inst; 3298 DebugLoc DL = Inst.getDebugLoc(); 3299 3300 MachineOperand &Dest = Inst.getOperand(0); 3301 MachineOperand &Src = Inst.getOperand(1); 3302 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3303 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3304 3305 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 3306 .addImm(0) 3307 .addReg(Src.getReg()); 3308 3309 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 3310 .addReg(Src.getReg()) 3311 .addReg(TmpReg); 3312 3313 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3314 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3315 } 3316 3317 void SIInstrInfo::splitScalar64BitUnaryOp( 3318 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3319 unsigned Opcode) const { 3320 MachineBasicBlock &MBB = *Inst.getParent(); 3321 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3322 3323 MachineOperand &Dest = Inst.getOperand(0); 3324 MachineOperand &Src0 = Inst.getOperand(1); 3325 DebugLoc DL = Inst.getDebugLoc(); 3326 3327 MachineBasicBlock::iterator MII = Inst; 3328 3329 const MCInstrDesc &InstDesc = get(Opcode); 3330 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
3331 MRI.getRegClass(Src0.getReg()) : 3332 &AMDGPU::SGPR_32RegClass; 3333 3334 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3335 3336 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3337 AMDGPU::sub0, Src0SubRC); 3338 3339 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3340 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3341 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3342 3343 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3344 BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 3345 3346 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3347 AMDGPU::sub1, Src0SubRC); 3348 3349 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3350 BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 3351 3352 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3353 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3354 .addReg(DestSub0) 3355 .addImm(AMDGPU::sub0) 3356 .addReg(DestSub1) 3357 .addImm(AMDGPU::sub1); 3358 3359 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3360 3361 // We don't need to legalizeOperands here because for a single operand, src0 3362 // will support any kind of input. 3363 3364 // Move all users of this moved value. 3365 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3366 } 3367 3368 void SIInstrInfo::splitScalar64BitBinaryOp( 3369 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3370 unsigned Opcode) const { 3371 MachineBasicBlock &MBB = *Inst.getParent(); 3372 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3373 3374 MachineOperand &Dest = Inst.getOperand(0); 3375 MachineOperand &Src0 = Inst.getOperand(1); 3376 MachineOperand &Src1 = Inst.getOperand(2); 3377 DebugLoc DL = Inst.getDebugLoc(); 3378 3379 MachineBasicBlock::iterator MII = Inst; 3380 3381 const MCInstrDesc &InstDesc = get(Opcode); 3382 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3383 MRI.getRegClass(Src0.getReg()) : 3384 &AMDGPU::SGPR_32RegClass; 3385 3386 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3387 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
3388 MRI.getRegClass(Src1.getReg()) : 3389 &AMDGPU::SGPR_32RegClass; 3390 3391 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 3392 3393 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3394 AMDGPU::sub0, Src0SubRC); 3395 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3396 AMDGPU::sub0, Src1SubRC); 3397 3398 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3399 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3400 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3401 3402 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3403 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3404 .add(SrcReg0Sub0) 3405 .add(SrcReg1Sub0); 3406 3407 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3408 AMDGPU::sub1, Src0SubRC); 3409 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3410 AMDGPU::sub1, Src1SubRC); 3411 3412 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3413 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3414 .add(SrcReg0Sub1) 3415 .add(SrcReg1Sub1); 3416 3417 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3418 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3419 .addReg(DestSub0) 3420 .addImm(AMDGPU::sub0) 3421 .addReg(DestSub1) 3422 .addImm(AMDGPU::sub1); 3423 3424 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3425 3426 // Try to legalize the operands in case we need to swap the order to keep it 3427 // valid. 3428 legalizeOperands(LoHalf); 3429 legalizeOperands(HiHalf); 3430 3431 // Move all users of this moved value. 3432 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3433 } 3434 3435 void SIInstrInfo::splitScalar64BitBCNT( 3436 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { 3437 MachineBasicBlock &MBB = *Inst.getParent(); 3438 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3439 3440 MachineBasicBlock::iterator MII = Inst; 3441 DebugLoc DL = Inst.getDebugLoc(); 3442 3443 MachineOperand &Dest = Inst.getOperand(0); 3444 MachineOperand &Src = Inst.getOperand(1); 3445 3446 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 3447 const TargetRegisterClass *SrcRC = Src.isReg() ? 3448 MRI.getRegClass(Src.getReg()) : 3449 &AMDGPU::SGPR_32RegClass; 3450 3451 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3452 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3453 3454 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 3455 3456 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3457 AMDGPU::sub0, SrcSubRC); 3458 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3459 AMDGPU::sub1, SrcSubRC); 3460 3461 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 3462 3463 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 3464 3465 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3466 3467 // We don't need to legalize operands here. src0 for either instruction can be 3468 // an SGPR, and the second input is unused or determined here.
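  // Net effect, e.g. for s_bcnt1_i32_b64 %dst, %src:
  //   %mid = V_BCNT_U32_B32_e64 %src.sub0, 0
  //   %dst = V_BCNT_U32_B32_e64 %src.sub1, %mid
  // (V_BCNT_U32_B32 adds its second operand to the popcount of the first.)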
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);

      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI :
       llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
                        SCCDefInst.getParent()->end())) {
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(&MI);
  }
}

const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.
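  //
  // Hedged example of the implicit-use case (not from the original comment):
  // a VOP2 form such as V_CNDMASK_B32_e32 implicitly reads VCC, and that read
  // already consumes the single constant bus slot, so no explicit source
  // operand of the same instruction may also be an SGPR or a literal.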

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some are 32-bit, we
  // should prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // Note that this disables the TC L2 cache and therefore decreases
    // performance.
    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size.

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64.
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
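  //
  // (Illustrative note, not from the original comment: the bits cleared below
  // live in the high half of the returned value, i.e. SRD word 3, while the
  // low half, word 2, holds NUM_RECORDS = 0xffffffff from the Size term
  // above.)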
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {

  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  //
  // FIXME: Instructions that have a base 32-bit encoding report their size as
  // 4, even though they are really 8 bytes if they have a literal operand.
  if (DescSize != 0 && DescSize != 4)
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    if (isFixedSize(MI))
      return DescSize;

    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return 4; // No operands.
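
    // Hedged example of the distinction checked below (not from the original
    // source): an inline constant such as 1.0 or -16 fits in the base 32-bit
    // encoding (4 bytes), whereas an arbitrary value like 0x12345678 must be
    // emitted as a trailing 32-bit literal, making the instruction 8 bytes.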

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return 8;

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return 4;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return 8;

    return 4;
  }

  if (DescSize == 4)
    return 4;

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::INLINEASM: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
  }
  default:
    llvm_unreachable("unable to find instruction size");
  }
}

bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }
  return false;
}

ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           unsigned DestReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
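
// Illustrative usage sketch (hedged, not part of the original file): a caller
// appends the two source operands itself, e.g.
//   getAddNoCarry(MBB, I, DL, DstReg)
//     .addReg(SrcA)
//     .addReg(SrcB);
// which materializes a V_ADD_I32_e64 whose carry-out def is marked dead.
// SrcA and SrcB are hypothetical names used only for this sketch.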