//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
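  // For now this is a conservative whitelist: only plain 32-bit and 64-bit
  // VGPR moves are treated as trivially rematerializable here; everything
  // else falls through to the default case and is rejected.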
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
189 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 190 return false; 191 192 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 193 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 194 return true; 195 } 196 197 return false; 198 } 199 200 static bool isStride64(unsigned Opc) { 201 switch (Opc) { 202 case AMDGPU::DS_READ2ST64_B32: 203 case AMDGPU::DS_READ2ST64_B64: 204 case AMDGPU::DS_WRITE2ST64_B32: 205 case AMDGPU::DS_WRITE2ST64_B64: 206 return true; 207 default: 208 return false; 209 } 210 } 211 212 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 213 int64_t &Offset, 214 const TargetRegisterInfo *TRI) const { 215 unsigned Opc = LdSt.getOpcode(); 216 217 if (isDS(LdSt)) { 218 const MachineOperand *OffsetImm = 219 getNamedOperand(LdSt, AMDGPU::OpName::offset); 220 if (OffsetImm) { 221 // Normal, single offset LDS instruction. 222 const MachineOperand *AddrReg = 223 getNamedOperand(LdSt, AMDGPU::OpName::addr); 224 225 BaseReg = AddrReg->getReg(); 226 Offset = OffsetImm->getImm(); 227 return true; 228 } 229 230 // The 2 offset instructions use offset0 and offset1 instead. We can treat 231 // these as a load with a single offset if the 2 offsets are consecutive. We 232 // will use this for some partially aligned loads. 233 const MachineOperand *Offset0Imm = 234 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 235 const MachineOperand *Offset1Imm = 236 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 237 238 uint8_t Offset0 = Offset0Imm->getImm(); 239 uint8_t Offset1 = Offset1Imm->getImm(); 240 241 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 242 // Each of these offsets is in element sized units, so we need to convert 243 // to bytes of the individual reads. 244 245 unsigned EltSize; 246 if (LdSt.mayLoad()) 247 EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; 248 else { 249 assert(LdSt.mayStore()); 250 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 251 EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); 252 } 253 254 if (isStride64(Opc)) 255 EltSize *= 64; 256 257 const MachineOperand *AddrReg = 258 getNamedOperand(LdSt, AMDGPU::OpName::addr); 259 BaseReg = AddrReg->getReg(); 260 Offset = EltSize * Offset0; 261 return true; 262 } 263 264 return false; 265 } 266 267 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 268 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) 269 return false; 270 271 const MachineOperand *AddrReg = 272 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 273 if (!AddrReg) 274 return false; 275 276 const MachineOperand *OffsetImm = 277 getNamedOperand(LdSt, AMDGPU::OpName::offset); 278 BaseReg = AddrReg->getReg(); 279 Offset = OffsetImm->getImm(); 280 return true; 281 } 282 283 if (isSMRD(LdSt)) { 284 const MachineOperand *OffsetImm = 285 getNamedOperand(LdSt, AMDGPU::OpName::offset); 286 if (!OffsetImm) 287 return false; 288 289 const MachineOperand *SBaseReg = 290 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 291 BaseReg = SBaseReg->getReg(); 292 Offset = OffsetImm->getImm(); 293 return true; 294 } 295 296 if (isFLAT(LdSt)) { 297 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); 298 BaseReg = AddrReg->getReg(); 299 Offset = 0; 300 return true; 301 } 302 303 return false; 304 } 305 306 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 307 MachineInstr &SecondLdSt, 308 unsigned NumLoads) const { 309 const MachineOperand *FirstDst = nullptr; 310 const MachineOperand *SecondDst = nullptr; 311 312 if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 313 
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 314 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 315 } 316 317 if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 318 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 319 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 320 } 321 322 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 323 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { 324 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 325 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 326 } 327 328 if (!FirstDst || !SecondDst) 329 return false; 330 331 // Try to limit clustering based on the total number of bytes loaded 332 // rather than the number of instructions. This is done to help reduce 333 // register pressure. The method used is somewhat inexact, though, 334 // because it assumes that all loads in the cluster will load the 335 // same number of bytes as FirstLdSt. 336 337 // The unit of this value is bytes. 338 // FIXME: This needs finer tuning. 339 unsigned LoadClusterThreshold = 16; 340 341 const MachineRegisterInfo &MRI = 342 FirstLdSt.getParent()->getParent()->getRegInfo(); 343 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 344 345 return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; 346 } 347 348 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 349 MachineBasicBlock::iterator MI, 350 const DebugLoc &DL, unsigned DestReg, 351 unsigned SrcReg, bool KillSrc) const { 352 353 static const int16_t Sub0_15[] = { 354 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 355 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 356 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 357 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 358 }; 359 360 static const int16_t Sub0_15_64[] = { 361 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 362 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 363 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 364 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 365 }; 366 367 static const int16_t Sub0_7[] = { 368 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 369 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 370 }; 371 372 static const int16_t Sub0_7_64[] = { 373 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 374 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 375 }; 376 377 static const int16_t Sub0_3[] = { 378 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 379 }; 380 381 static const int16_t Sub0_3_64[] = { 382 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 383 }; 384 385 static const int16_t Sub0_2[] = { 386 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 387 }; 388 389 static const int16_t Sub0_1[] = { 390 AMDGPU::sub0, AMDGPU::sub1, 391 }; 392 393 unsigned Opcode; 394 ArrayRef<int16_t> SubIndices; 395 396 if (AMDGPU::SReg_32RegClass.contains(DestReg)) { 397 if (SrcReg == AMDGPU::SCC) { 398 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 399 .addImm(-1) 400 .addImm(0); 401 return; 402 } 403 404 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 405 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 406 .addReg(SrcReg, getKillRegState(KillSrc)); 407 return; 408 409 } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { 410 if (DestReg == AMDGPU::VCC) { 411 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 412 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 413 .addReg(SrcReg, getKillRegState(KillSrc)); 414 } else { 415 // FIXME: Hack until VReg_1 removed. 
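      // Note: V_CMP_NE_U32 implicitly writes VCC, so each active lane of VCC
      // ends up holding (SrcReg != 0); this materializes the boolean VGPR
      // value into VCC without an explicit copy.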
416 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 417 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 418 .addImm(0) 419 .addReg(SrcReg, getKillRegState(KillSrc)); 420 } 421 422 return; 423 } 424 425 assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); 426 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 427 .addReg(SrcReg, getKillRegState(KillSrc)); 428 return; 429 430 } else if (DestReg == AMDGPU::SCC) { 431 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 432 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 433 .addReg(SrcReg, getKillRegState(KillSrc)) 434 .addImm(0); 435 return; 436 } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { 437 assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); 438 Opcode = AMDGPU::S_MOV_B64; 439 SubIndices = Sub0_3_64; 440 441 } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { 442 assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); 443 Opcode = AMDGPU::S_MOV_B64; 444 SubIndices = Sub0_7_64; 445 446 } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { 447 assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); 448 Opcode = AMDGPU::S_MOV_B64; 449 SubIndices = Sub0_15_64; 450 451 } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { 452 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 453 AMDGPU::SReg_32RegClass.contains(SrcReg)); 454 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 455 .addReg(SrcReg, getKillRegState(KillSrc)); 456 return; 457 458 } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { 459 assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || 460 AMDGPU::SReg_64RegClass.contains(SrcReg)); 461 Opcode = AMDGPU::V_MOV_B32_e32; 462 SubIndices = Sub0_1; 463 464 } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { 465 assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); 466 Opcode = AMDGPU::V_MOV_B32_e32; 467 SubIndices = Sub0_2; 468 469 } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { 470 assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || 471 AMDGPU::SReg_128RegClass.contains(SrcReg)); 472 Opcode = AMDGPU::V_MOV_B32_e32; 473 SubIndices = Sub0_3; 474 475 } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { 476 assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || 477 AMDGPU::SReg_256RegClass.contains(SrcReg)); 478 Opcode = AMDGPU::V_MOV_B32_e32; 479 SubIndices = Sub0_7; 480 481 } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { 482 assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || 483 AMDGPU::SReg_512RegClass.contains(SrcReg)); 484 Opcode = AMDGPU::V_MOV_B32_e32; 485 SubIndices = Sub0_15; 486 487 } else { 488 llvm_unreachable("Can't copy register!"); 489 } 490 491 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 492 493 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 494 unsigned SubIdx; 495 if (Forward) 496 SubIdx = SubIndices[Idx]; 497 else 498 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 499 500 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 501 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 502 503 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 504 505 if (Idx == SubIndices.size() - 1) 506 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 507 508 if (Idx == 0) 509 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 510 511 Builder.addReg(SrcReg, RegState::Implicit); 512 } 513 } 514 515 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 516 int NewOpc; 517 518 // Try to map original to commuted opcode 519 NewOpc = AMDGPU::getCommuteRev(Opcode); 520 if (NewOpc != -1) 521 // Check if the commuted (REV) opcode exists on the 
target. 522 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 523 524 // Try to map commuted to original opcode 525 NewOpc = AMDGPU::getCommuteOrig(Opcode); 526 if (NewOpc != -1) 527 // Check if the original (non-REV) opcode exists on the target. 528 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 529 530 return Opcode; 531 } 532 533 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 534 535 if (DstRC->getSize() == 4) { 536 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 537 } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { 538 return AMDGPU::S_MOV_B64; 539 } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { 540 return AMDGPU::V_MOV_B64_PSEUDO; 541 } 542 return AMDGPU::COPY; 543 } 544 545 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 546 switch (Size) { 547 case 4: 548 return AMDGPU::SI_SPILL_S32_SAVE; 549 case 8: 550 return AMDGPU::SI_SPILL_S64_SAVE; 551 case 16: 552 return AMDGPU::SI_SPILL_S128_SAVE; 553 case 32: 554 return AMDGPU::SI_SPILL_S256_SAVE; 555 case 64: 556 return AMDGPU::SI_SPILL_S512_SAVE; 557 default: 558 llvm_unreachable("unknown register size"); 559 } 560 } 561 562 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 563 switch (Size) { 564 case 4: 565 return AMDGPU::SI_SPILL_V32_SAVE; 566 case 8: 567 return AMDGPU::SI_SPILL_V64_SAVE; 568 case 12: 569 return AMDGPU::SI_SPILL_V96_SAVE; 570 case 16: 571 return AMDGPU::SI_SPILL_V128_SAVE; 572 case 32: 573 return AMDGPU::SI_SPILL_V256_SAVE; 574 case 64: 575 return AMDGPU::SI_SPILL_V512_SAVE; 576 default: 577 llvm_unreachable("unknown register size"); 578 } 579 } 580 581 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 582 MachineBasicBlock::iterator MI, 583 unsigned SrcReg, bool isKill, 584 int FrameIndex, 585 const TargetRegisterClass *RC, 586 const TargetRegisterInfo *TRI) const { 587 MachineFunction *MF = MBB.getParent(); 588 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 589 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 590 DebugLoc DL = MBB.findDebugLoc(MI); 591 592 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 593 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 594 MachinePointerInfo PtrInfo 595 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 596 MachineMemOperand *MMO 597 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 598 Size, Align); 599 600 if (RI.isSGPRClass(RC)) { 601 MFI->setHasSpilledSGPRs(); 602 603 // We are only allowed to create one new instruction when spilling 604 // registers, so we need to use pseudo instruction for spilling SGPRs. 605 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); 606 607 // The SGPR spill/restore instructions only work on number sgprs, so we need 608 // to make sure we are using the correct register class. 
609 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { 610 MachineRegisterInfo &MRI = MF->getRegInfo(); 611 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 612 } 613 614 BuildMI(MBB, MI, DL, OpDesc) 615 .addReg(SrcReg, getKillRegState(isKill)) // data 616 .addFrameIndex(FrameIndex) // addr 617 .addMemOperand(MMO); 618 619 return; 620 } 621 622 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 623 LLVMContext &Ctx = MF->getFunction()->getContext(); 624 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 625 " spill register"); 626 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 627 .addReg(SrcReg); 628 629 return; 630 } 631 632 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 633 634 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 635 MFI->setHasSpilledVGPRs(); 636 BuildMI(MBB, MI, DL, get(Opcode)) 637 .addReg(SrcReg, getKillRegState(isKill)) // data 638 .addFrameIndex(FrameIndex) // addr 639 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 640 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 641 .addImm(0) // offset 642 .addMemOperand(MMO); 643 } 644 645 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 646 switch (Size) { 647 case 4: 648 return AMDGPU::SI_SPILL_S32_RESTORE; 649 case 8: 650 return AMDGPU::SI_SPILL_S64_RESTORE; 651 case 16: 652 return AMDGPU::SI_SPILL_S128_RESTORE; 653 case 32: 654 return AMDGPU::SI_SPILL_S256_RESTORE; 655 case 64: 656 return AMDGPU::SI_SPILL_S512_RESTORE; 657 default: 658 llvm_unreachable("unknown register size"); 659 } 660 } 661 662 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 663 switch (Size) { 664 case 4: 665 return AMDGPU::SI_SPILL_V32_RESTORE; 666 case 8: 667 return AMDGPU::SI_SPILL_V64_RESTORE; 668 case 12: 669 return AMDGPU::SI_SPILL_V96_RESTORE; 670 case 16: 671 return AMDGPU::SI_SPILL_V128_RESTORE; 672 case 32: 673 return AMDGPU::SI_SPILL_V256_RESTORE; 674 case 64: 675 return AMDGPU::SI_SPILL_V512_RESTORE; 676 default: 677 llvm_unreachable("unknown register size"); 678 } 679 } 680 681 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 682 MachineBasicBlock::iterator MI, 683 unsigned DestReg, int FrameIndex, 684 const TargetRegisterClass *RC, 685 const TargetRegisterInfo *TRI) const { 686 MachineFunction *MF = MBB.getParent(); 687 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 688 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 689 DebugLoc DL = MBB.findDebugLoc(MI); 690 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 691 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 692 693 MachinePointerInfo PtrInfo 694 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 695 696 MachineMemOperand *MMO = MF->getMachineMemOperand( 697 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 698 699 if (RI.isSGPRClass(RC)) { 700 // FIXME: Maybe this should not include a memoperand because it will be 701 // lowered to non-memory instructions. 
702 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); 703 if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { 704 MachineRegisterInfo &MRI = MF->getRegInfo(); 705 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 706 } 707 708 BuildMI(MBB, MI, DL, OpDesc, DestReg) 709 .addFrameIndex(FrameIndex) // addr 710 .addMemOperand(MMO); 711 712 return; 713 } 714 715 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 716 LLVMContext &Ctx = MF->getFunction()->getContext(); 717 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 718 " restore register"); 719 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 720 721 return; 722 } 723 724 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 725 726 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 727 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 728 .addFrameIndex(FrameIndex) // vaddr 729 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 730 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 731 .addImm(0) // offset 732 .addMemOperand(MMO); 733 } 734 735 /// \param @Offset Offset in bytes of the FrameIndex being spilled 736 unsigned SIInstrInfo::calculateLDSSpillAddress( 737 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 738 unsigned FrameOffset, unsigned Size) const { 739 MachineFunction *MF = MBB.getParent(); 740 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 741 const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); 742 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 743 DebugLoc DL = MBB.findDebugLoc(MI); 744 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 745 unsigned WavefrontSize = ST.getWavefrontSize(); 746 747 unsigned TIDReg = MFI->getTIDReg(); 748 if (!MFI->hasCalculatedTID()) { 749 MachineBasicBlock &Entry = MBB.getParent()->front(); 750 MachineBasicBlock::iterator Insert = Entry.front(); 751 DebugLoc DL = Insert->getDebugLoc(); 752 753 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 754 *MF); 755 if (TIDReg == AMDGPU::NoRegister) 756 return TIDReg; 757 758 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 759 WorkGroupSize > WavefrontSize) { 760 761 unsigned TIDIGXReg 762 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 763 unsigned TIDIGYReg 764 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 765 unsigned TIDIGZReg 766 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 767 unsigned InputPtrReg = 768 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 769 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 770 if (!Entry.isLiveIn(Reg)) 771 Entry.addLiveIn(Reg); 772 } 773 774 RS->enterBasicBlock(Entry); 775 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
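      // The code below fills TIDReg with a linearized id for the current
      // thread; it is later shifted left by 2 (one dword per lane) so it can
      // be added to the LDS byte offset computed at the end of this function.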
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
881 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 882 break; 883 } 884 case AMDGPU::V_MOV_B64_PSEUDO: { 885 unsigned Dst = MI.getOperand(0).getReg(); 886 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 887 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 888 889 const MachineOperand &SrcOp = MI.getOperand(1); 890 // FIXME: Will this work for 64-bit floating point immediates? 891 assert(!SrcOp.isFPImm()); 892 if (SrcOp.isImm()) { 893 APInt Imm(64, SrcOp.getImm()); 894 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 895 .addImm(Imm.getLoBits(32).getZExtValue()) 896 .addReg(Dst, RegState::Implicit | RegState::Define); 897 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 898 .addImm(Imm.getHiBits(32).getZExtValue()) 899 .addReg(Dst, RegState::Implicit | RegState::Define); 900 } else { 901 assert(SrcOp.isReg()); 902 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 903 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 904 .addReg(Dst, RegState::Implicit | RegState::Define); 905 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 906 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 907 .addReg(Dst, RegState::Implicit | RegState::Define); 908 } 909 MI.eraseFromParent(); 910 break; 911 } 912 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 913 MachineFunction &MF = *MBB.getParent(); 914 unsigned Reg = MI.getOperand(0).getReg(); 915 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 916 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 917 918 // Create a bundle so these instructions won't be re-ordered by the 919 // post-RA scheduler. 920 MIBundleBuilder Bundler(MBB, MI); 921 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 922 923 // Add 32-bit offset from this instruction to the start of the 924 // constant data. 925 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 926 .addReg(RegLo) 927 .addOperand(MI.getOperand(1))); 928 929 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 930 .addReg(RegHi); 931 if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) 932 MIB.addImm(0); 933 else 934 MIB.addOperand(MI.getOperand(2)); 935 936 Bundler.append(MIB); 937 llvm::finalizeBundle(MBB, Bundler.begin()); 938 939 MI.eraseFromParent(); 940 break; 941 } 942 } 943 return true; 944 } 945 946 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 947 MachineOperand &Src0, 948 unsigned Src0OpName, 949 MachineOperand &Src1, 950 unsigned Src1OpName) const { 951 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 952 if (!Src0Mods) 953 return false; 954 955 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 956 assert(Src1Mods && 957 "All commutable instructions have both src0 and src1 modifiers"); 958 959 int Src0ModsVal = Src0Mods->getImm(); 960 int Src1ModsVal = Src1Mods->getImm(); 961 962 Src1Mods->setImm(Src0ModsVal); 963 Src0Mods->setImm(Src1ModsVal); 964 return true; 965 } 966 967 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 968 MachineOperand &RegOp, 969 MachineOperand &NonRegOp) { 970 unsigned Reg = RegOp.getReg(); 971 unsigned SubReg = RegOp.getSubReg(); 972 bool IsKill = RegOp.isKill(); 973 bool IsDead = RegOp.isDead(); 974 bool IsUndef = RegOp.isUndef(); 975 bool IsDebug = RegOp.isDebug(); 976 977 if (NonRegOp.isImm()) 978 RegOp.ChangeToImmediate(NonRegOp.getImm()); 979 else if (NonRegOp.isFI()) 980 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 981 else 982 return nullptr; 983 984 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 985 NonRegOp.setSubReg(SubReg); 986 
987 return &MI; 988 } 989 990 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 991 unsigned Src0Idx, 992 unsigned Src1Idx) const { 993 assert(!NewMI && "this should never be used"); 994 995 unsigned Opc = MI.getOpcode(); 996 int CommutedOpcode = commuteOpcode(Opc); 997 if (CommutedOpcode == -1) 998 return nullptr; 999 1000 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1001 static_cast<int>(Src0Idx) && 1002 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1003 static_cast<int>(Src1Idx) && 1004 "inconsistency with findCommutedOpIndices"); 1005 1006 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1007 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1008 1009 MachineInstr *CommutedMI = nullptr; 1010 if (Src0.isReg() && Src1.isReg()) { 1011 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1012 // Be sure to copy the source modifiers to the right place. 1013 CommutedMI 1014 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1015 } 1016 1017 } else if (Src0.isReg() && !Src1.isReg()) { 1018 // src0 should always be able to support any operand type, so no need to 1019 // check operand legality. 1020 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1021 } else if (!Src0.isReg() && Src1.isReg()) { 1022 if (isOperandLegal(MI, Src1Idx, &Src0)) 1023 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1024 } else { 1025 // FIXME: Found two non registers to commute. This does happen. 1026 return nullptr; 1027 } 1028 1029 1030 if (CommutedMI) { 1031 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1032 Src1, AMDGPU::OpName::src1_modifiers); 1033 1034 CommutedMI->setDesc(get(CommutedOpcode)); 1035 } 1036 1037 return CommutedMI; 1038 } 1039 1040 // This needs to be implemented because the source modifiers may be inserted 1041 // between the true commutable operands, and the base 1042 // TargetInstrInfo::commuteInstruction uses it. 1043 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, 1044 unsigned &SrcOpIdx1) const { 1045 if (!MI.isCommutable()) 1046 return false; 1047 1048 unsigned Opc = MI.getOpcode(); 1049 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1050 if (Src0Idx == -1) 1051 return false; 1052 1053 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1054 if (Src1Idx == -1) 1055 return false; 1056 1057 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1058 } 1059 1060 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1061 int64_t BrOffset) const { 1062 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1063 // block is unanalyzable. 1064 assert(BranchOp != AMDGPU::S_SETPC_B64); 1065 1066 // Convert to dwords. 1067 BrOffset /= 4; 1068 1069 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1070 // from the next instruction. 1071 BrOffset -= 1; 1072 1073 return isIntN(BranchOffsetBits, BrOffset); 1074 } 1075 1076 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1077 const MachineInstr &MI) const { 1078 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1079 // This would be a difficult analysis to perform, but can always be legal so 1080 // there's no need to analyze it. 
1081 return nullptr; 1082 } 1083 1084 return MI.getOperand(0).getMBB(); 1085 } 1086 1087 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1088 MachineBasicBlock &DestBB, 1089 const DebugLoc &DL, 1090 int64_t BrOffset, 1091 RegScavenger *RS) const { 1092 assert(RS && "RegScavenger required for long branching"); 1093 assert(MBB.empty() && 1094 "new block should be inserted for expanding unconditional branch"); 1095 assert(MBB.pred_size() == 1); 1096 1097 MachineFunction *MF = MBB.getParent(); 1098 MachineRegisterInfo &MRI = MF->getRegInfo(); 1099 1100 // FIXME: Virtual register workaround for RegScavenger not working with empty 1101 // blocks. 1102 unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1103 1104 auto I = MBB.end(); 1105 1106 // We need to compute the offset relative to the instruction immediately after 1107 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1108 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1109 1110 // TODO: Handle > 32-bit block address. 1111 if (BrOffset >= 0) { 1112 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1113 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1114 .addReg(PCReg, 0, AMDGPU::sub0) 1115 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); 1116 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1117 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1118 .addReg(PCReg, 0, AMDGPU::sub1) 1119 .addImm(0); 1120 } else { 1121 // Backwards branch. 1122 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 1123 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1124 .addReg(PCReg, 0, AMDGPU::sub0) 1125 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); 1126 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 1127 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1128 .addReg(PCReg, 0, AMDGPU::sub1) 1129 .addImm(0); 1130 } 1131 1132 // Insert the indirect branch after the other terminator. 1133 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 1134 .addReg(PCReg); 1135 1136 // FIXME: If spilling is necessary, this will fail because this scavenger has 1137 // no emergency stack slots. It is non-trivial to spill in this situation, 1138 // because the restore code needs to be specially placed after the 1139 // jump. BranchRelaxation then needs to be made aware of the newly inserted 1140 // block. 1141 // 1142 // If a spill is needed for the pc register pair, we need to insert a spill 1143 // restore block right before the destination block, and insert a short branch 1144 // into the old destination block's fallthrough predecessor. 1145 // e.g.: 1146 // 1147 // s_cbranch_scc0 skip_long_branch: 1148 // 1149 // long_branch_bb: 1150 // spill s[8:9] 1151 // s_getpc_b64 s[8:9] 1152 // s_add_u32 s8, s8, restore_bb 1153 // s_addc_u32 s9, s9, 0 1154 // s_setpc_b64 s[8:9] 1155 // 1156 // skip_long_branch: 1157 // foo; 1158 // 1159 // ..... 
1160 // 1161 // dest_bb_fallthrough_predecessor: 1162 // bar; 1163 // s_branch dest_bb 1164 // 1165 // restore_bb: 1166 // restore s[8:9] 1167 // fallthrough dest_bb 1168 /// 1169 // dest_bb: 1170 // buzz; 1171 1172 RS->enterBasicBlockEnd(MBB); 1173 unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, 1174 MachineBasicBlock::iterator(GetPC), 0); 1175 MRI.replaceRegWith(PCReg, Scav); 1176 MRI.clearVirtRegs(); 1177 RS->setRegUsed(Scav); 1178 1179 return 4 + 8 + 4 + 4; 1180 } 1181 1182 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 1183 switch (Cond) { 1184 case SIInstrInfo::SCC_TRUE: 1185 return AMDGPU::S_CBRANCH_SCC1; 1186 case SIInstrInfo::SCC_FALSE: 1187 return AMDGPU::S_CBRANCH_SCC0; 1188 case SIInstrInfo::VCCNZ: 1189 return AMDGPU::S_CBRANCH_VCCNZ; 1190 case SIInstrInfo::VCCZ: 1191 return AMDGPU::S_CBRANCH_VCCZ; 1192 case SIInstrInfo::EXECNZ: 1193 return AMDGPU::S_CBRANCH_EXECNZ; 1194 case SIInstrInfo::EXECZ: 1195 return AMDGPU::S_CBRANCH_EXECZ; 1196 default: 1197 llvm_unreachable("invalid branch predicate"); 1198 } 1199 } 1200 1201 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 1202 switch (Opcode) { 1203 case AMDGPU::S_CBRANCH_SCC0: 1204 return SCC_FALSE; 1205 case AMDGPU::S_CBRANCH_SCC1: 1206 return SCC_TRUE; 1207 case AMDGPU::S_CBRANCH_VCCNZ: 1208 return VCCNZ; 1209 case AMDGPU::S_CBRANCH_VCCZ: 1210 return VCCZ; 1211 case AMDGPU::S_CBRANCH_EXECNZ: 1212 return EXECNZ; 1213 case AMDGPU::S_CBRANCH_EXECZ: 1214 return EXECZ; 1215 default: 1216 return INVALID_BR; 1217 } 1218 } 1219 1220 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 1221 MachineBasicBlock::iterator I, 1222 MachineBasicBlock *&TBB, 1223 MachineBasicBlock *&FBB, 1224 SmallVectorImpl<MachineOperand> &Cond, 1225 bool AllowModify) const { 1226 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1227 // Unconditional Branch 1228 TBB = I->getOperand(0).getMBB(); 1229 return false; 1230 } 1231 1232 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 1233 if (Pred == INVALID_BR) 1234 return true; 1235 1236 MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); 1237 Cond.push_back(MachineOperand::CreateImm(Pred)); 1238 1239 ++I; 1240 1241 if (I == MBB.end()) { 1242 // Conditional branch followed by fall-through. 1243 TBB = CondBB; 1244 return false; 1245 } 1246 1247 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1248 TBB = CondBB; 1249 FBB = I->getOperand(0).getMBB(); 1250 return false; 1251 } 1252 1253 return true; 1254 } 1255 1256 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 1257 MachineBasicBlock *&FBB, 1258 SmallVectorImpl<MachineOperand> &Cond, 1259 bool AllowModify) const { 1260 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1261 if (I == MBB.end()) 1262 return false; 1263 1264 if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) 1265 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 1266 1267 ++I; 1268 1269 // TODO: Should be able to treat as fallthrough? 1270 if (I == MBB.end()) 1271 return true; 1272 1273 if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) 1274 return true; 1275 1276 MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); 1277 1278 // Specifically handle the case where the conditional branch is to the same 1279 // destination as the mask branch. e.g. 1280 // 1281 // si_mask_branch BB8 1282 // s_cbranch_execz BB8 1283 // s_cbranch BB9 1284 // 1285 // This is required to understand divergent loops which may need the branches 1286 // to be relaxed. 
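  // The pair is only treated as analyzable when the conditional branch
  // targets the mask branch's destination and its predicate is an EXEC
  // compare; anything else is reported as unanalyzable.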
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {

  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = 4;
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    if (BytesAdded)
      *BytesAdded = 4;
    return 1;
  }

  assert(TBB && FBB);

  BuildMI(&MBB, DL, get(Opcode))
    .addMBB(TBB);
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    .addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

bool SIInstrInfo::reverseBranchCondition(
  SmallVectorImpl<MachineOperand> &Cond) const {
  assert(Cond.size() == 1);
  Cond[0].setImm(-Cond[0].getImm());
  return false;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    switch (DefMI.getOpcode()) {
    default:
      return false;
    case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
      return false;

    case AMDGPU::V_MOV_B32_e32:
    case AMDGPU::S_MOV_B32:
      break;
    }
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    assert(ImmOp);
    // FIXME: We could handle FrameIndex values here.
1406 if (!ImmOp->isImm()) { 1407 return false; 1408 } 1409 UseMI.setDesc(get(NewOpc)); 1410 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 1411 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 1412 return true; 1413 } 1414 1415 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { 1416 // Don't fold if we are using source modifiers. The new VOP2 instructions 1417 // don't have them. 1418 if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || 1419 hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || 1420 hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { 1421 return false; 1422 } 1423 1424 const MachineOperand &ImmOp = DefMI.getOperand(1); 1425 1426 // If this is a free constant, there's no reason to do this. 1427 // TODO: We could fold this here instead of letting SIFoldOperands do it 1428 // later. 1429 if (isInlineConstant(ImmOp, 4)) 1430 return false; 1431 1432 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 1433 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 1434 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 1435 1436 // Multiplied part is the constant: Use v_madmk_f32 1437 // We should only expect these to be on src0 due to canonicalizations. 1438 if (Src0->isReg() && Src0->getReg() == Reg) { 1439 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1440 return false; 1441 1442 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1443 return false; 1444 1445 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1446 1447 const int64_t Imm = DefMI.getOperand(1).getImm(); 1448 1449 // FIXME: This would be a lot easier if we could return a new instruction 1450 // instead of having to modify in place. 1451 1452 // Remove these first since they are at the end. 1453 UseMI.RemoveOperand( 1454 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1455 UseMI.RemoveOperand( 1456 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1457 1458 unsigned Src1Reg = Src1->getReg(); 1459 unsigned Src1SubReg = Src1->getSubReg(); 1460 Src0->setReg(Src1Reg); 1461 Src0->setSubReg(Src1SubReg); 1462 Src0->setIsKill(Src1->isKill()); 1463 1464 if (Opc == AMDGPU::V_MAC_F32_e64) { 1465 UseMI.untieRegOperand( 1466 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1467 } 1468 1469 Src1->ChangeToImmediate(Imm); 1470 1471 removeModOperands(UseMI); 1472 UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); 1473 1474 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1475 if (DeleteDef) 1476 DefMI.eraseFromParent(); 1477 1478 return true; 1479 } 1480 1481 // Added part is the constant: Use v_madak_f32 1482 if (Src2->isReg() && Src2->getReg() == Reg) { 1483 // Not allowed to use constant bus for another operand. 1484 // We can however allow an inline immediate as src0. 1485 if (!Src0->isImm() && 1486 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1487 return false; 1488 1489 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1490 return false; 1491 1492 const int64_t Imm = DefMI.getOperand(1).getImm(); 1493 1494 // FIXME: This would be a lot easier if we could return a new instruction 1495 // instead of having to modify in place. 1496 1497 // Remove these first since they are at the end. 
1498 UseMI.RemoveOperand( 1499 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1500 UseMI.RemoveOperand( 1501 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1502 1503 if (Opc == AMDGPU::V_MAC_F32_e64) { 1504 UseMI.untieRegOperand( 1505 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1506 } 1507 1508 // ChangingToImmediate adds Src2 back to the instruction. 1509 Src2->ChangeToImmediate(Imm); 1510 1511 // These come before src2. 1512 removeModOperands(UseMI); 1513 UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); 1514 1515 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1516 if (DeleteDef) 1517 DefMI.eraseFromParent(); 1518 1519 return true; 1520 } 1521 } 1522 1523 return false; 1524 } 1525 1526 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1527 int WidthB, int OffsetB) { 1528 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1529 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1530 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1531 return LowOffset + LowWidth <= HighOffset; 1532 } 1533 1534 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, 1535 MachineInstr &MIb) const { 1536 unsigned BaseReg0, BaseReg1; 1537 int64_t Offset0, Offset1; 1538 1539 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1540 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1541 1542 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 1543 // FIXME: Handle ds_read2 / ds_write2. 1544 return false; 1545 } 1546 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 1547 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 1548 if (BaseReg0 == BaseReg1 && 1549 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1550 return true; 1551 } 1552 } 1553 1554 return false; 1555 } 1556 1557 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 1558 MachineInstr &MIb, 1559 AliasAnalysis *AA) const { 1560 assert((MIa.mayLoad() || MIa.mayStore()) && 1561 "MIa must load from or modify a memory location"); 1562 assert((MIb.mayLoad() || MIb.mayStore()) && 1563 "MIb must load from or modify a memory location"); 1564 1565 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 1566 return false; 1567 1568 // XXX - Can we relax this between address spaces? 1569 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1570 return false; 1571 1572 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 1573 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 1574 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 1575 if (MMOa->getValue() && MMOb->getValue()) { 1576 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 1577 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 1578 if (!AA->alias(LocA, LocB)) 1579 return true; 1580 } 1581 } 1582 1583 // TODO: Should we check the address space from the MachineMemOperand? That 1584 // would allow us to distinguish objects we know don't alias based on the 1585 // underlying address space, even if it was lowered to a different one, 1586 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1587 // buffer. 
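  // The checks below compare the two accesses by instruction kind. Within a
  // kind, disjointness is decided from the base register and offsets; FLAT
  // accesses are conservatively assumed to possibly alias anything else.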
1588 if (isDS(MIa)) { 1589 if (isDS(MIb)) 1590 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1591 1592 return !isFLAT(MIb); 1593 } 1594 1595 if (isMUBUF(MIa) || isMTBUF(MIa)) { 1596 if (isMUBUF(MIb) || isMTBUF(MIb)) 1597 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1598 1599 return !isFLAT(MIb) && !isSMRD(MIb); 1600 } 1601 1602 if (isSMRD(MIa)) { 1603 if (isSMRD(MIb)) 1604 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1605 1606 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 1607 } 1608 1609 if (isFLAT(MIa)) { 1610 if (isFLAT(MIb)) 1611 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1612 1613 return false; 1614 } 1615 1616 return false; 1617 } 1618 1619 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1620 MachineInstr &MI, 1621 LiveVariables *LV) const { 1622 1623 switch (MI.getOpcode()) { 1624 default: 1625 return nullptr; 1626 case AMDGPU::V_MAC_F32_e64: 1627 break; 1628 case AMDGPU::V_MAC_F32_e32: { 1629 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 1630 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1631 return nullptr; 1632 break; 1633 } 1634 } 1635 1636 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 1637 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 1638 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 1639 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 1640 1641 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) 1642 .addOperand(*Dst) 1643 .addImm(0) // Src0 mods 1644 .addOperand(*Src0) 1645 .addImm(0) // Src1 mods 1646 .addOperand(*Src1) 1647 .addImm(0) // Src mods 1648 .addOperand(*Src2) 1649 .addImm(0) // clamp 1650 .addImm(0); // omod 1651 } 1652 1653 // It's not generally safe to move VALU instructions across these since it will 1654 // start using the register as a base index rather than directly. 1655 // XXX - Why isn't hasSideEffects sufficient for these? 1656 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 1657 switch (MI.getOpcode()) { 1658 case AMDGPU::S_SET_GPR_IDX_ON: 1659 case AMDGPU::S_SET_GPR_IDX_MODE: 1660 case AMDGPU::S_SET_GPR_IDX_OFF: 1661 return true; 1662 default: 1663 return false; 1664 } 1665 } 1666 1667 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1668 const MachineBasicBlock *MBB, 1669 const MachineFunction &MF) const { 1670 // XXX - Do we want the SP check in the base implementation? 1671 1672 // Target-independent instructions do not have an implicit-use of EXEC, even 1673 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1674 // boundaries prevents incorrect movements of such instructions. 
1675 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 1676 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 1677 changesVGPRIndexingMode(MI); 1678 } 1679 1680 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1681 int64_t SVal = Imm.getSExtValue(); 1682 if (SVal >= -16 && SVal <= 64) 1683 return true; 1684 1685 if (Imm.getBitWidth() == 64) { 1686 uint64_t Val = Imm.getZExtValue(); 1687 return (DoubleToBits(0.0) == Val) || 1688 (DoubleToBits(1.0) == Val) || 1689 (DoubleToBits(-1.0) == Val) || 1690 (DoubleToBits(0.5) == Val) || 1691 (DoubleToBits(-0.5) == Val) || 1692 (DoubleToBits(2.0) == Val) || 1693 (DoubleToBits(-2.0) == Val) || 1694 (DoubleToBits(4.0) == Val) || 1695 (DoubleToBits(-4.0) == Val); 1696 } 1697 1698 // The actual type of the operand does not seem to matter as long 1699 // as the bits match one of the inline immediate values. For example: 1700 // 1701 // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, 1702 // so it is a legal inline immediate. 1703 // 1704 // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in 1705 // floating-point, so it is a legal inline immediate. 1706 uint32_t Val = Imm.getZExtValue(); 1707 1708 return (FloatToBits(0.0f) == Val) || 1709 (FloatToBits(1.0f) == Val) || 1710 (FloatToBits(-1.0f) == Val) || 1711 (FloatToBits(0.5f) == Val) || 1712 (FloatToBits(-0.5f) == Val) || 1713 (FloatToBits(2.0f) == Val) || 1714 (FloatToBits(-2.0f) == Val) || 1715 (FloatToBits(4.0f) == Val) || 1716 (FloatToBits(-4.0f) == Val); 1717 } 1718 1719 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1720 unsigned OpSize) const { 1721 if (MO.isImm()) { 1722 // MachineOperand provides no way to tell the true operand size, since it 1723 // only records a 64-bit value. We need to know the size to determine if a 1724 // 32-bit floating point immediate bit pattern is legal for an integer 1725 // immediate. It would be for any 32-bit integer operand, but would not be 1726 // for a 64-bit one. 
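    // For example, 0x3f800000 (1.0f) is an inline constant for a 32-bit
    // operand, but the same immediate widened to a 64-bit operand is not,
    // since it matches none of the 64-bit inline values checked above.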
1727 1728 unsigned BitSize = 8 * OpSize; 1729 return isInlineConstant(APInt(BitSize, MO.getImm(), true)); 1730 } 1731 1732 return false; 1733 } 1734 1735 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, 1736 unsigned OpSize) const { 1737 return MO.isImm() && !isInlineConstant(MO, OpSize); 1738 } 1739 1740 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 1741 unsigned OpSize) const { 1742 switch (MO.getType()) { 1743 case MachineOperand::MO_Register: 1744 return false; 1745 case MachineOperand::MO_Immediate: 1746 return !isInlineConstant(MO, OpSize); 1747 case MachineOperand::MO_FrameIndex: 1748 case MachineOperand::MO_MachineBasicBlock: 1749 case MachineOperand::MO_ExternalSymbol: 1750 case MachineOperand::MO_GlobalAddress: 1751 case MachineOperand::MO_MCSymbol: 1752 return true; 1753 default: 1754 llvm_unreachable("unexpected operand type"); 1755 } 1756 } 1757 1758 static bool compareMachineOp(const MachineOperand &Op0, 1759 const MachineOperand &Op1) { 1760 if (Op0.getType() != Op1.getType()) 1761 return false; 1762 1763 switch (Op0.getType()) { 1764 case MachineOperand::MO_Register: 1765 return Op0.getReg() == Op1.getReg(); 1766 case MachineOperand::MO_Immediate: 1767 return Op0.getImm() == Op1.getImm(); 1768 default: 1769 llvm_unreachable("Didn't expect to be comparing these operand types"); 1770 } 1771 } 1772 1773 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 1774 const MachineOperand &MO) const { 1775 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 1776 1777 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1778 1779 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1780 return true; 1781 1782 if (OpInfo.RegClass < 0) 1783 return false; 1784 1785 unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); 1786 if (isLiteralConstant(MO, OpSize)) 1787 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1788 1789 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1790 } 1791 1792 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1793 int Op32 = AMDGPU::getVOPe32(Opcode); 1794 if (Op32 == -1) 1795 return false; 1796 1797 return pseudoToMCOpcode(Op32) != -1; 1798 } 1799 1800 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 1801 // The src0_modifier operand is present on all instructions 1802 // that have modifiers. 1803 1804 return AMDGPU::getNamedOperandIdx(Opcode, 1805 AMDGPU::OpName::src0_modifiers) != -1; 1806 } 1807 1808 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 1809 unsigned OpName) const { 1810 const MachineOperand *Mods = getNamedOperand(MI, OpName); 1811 return Mods && Mods->getImm(); 1812 } 1813 1814 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 1815 const MachineOperand &MO, 1816 unsigned OpSize) const { 1817 // Literal constants use the constant bus. 1818 if (isLiteralConstant(MO, OpSize)) 1819 return true; 1820 1821 if (!MO.isReg() || !MO.isUse()) 1822 return false; 1823 1824 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 1825 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 1826 1827 // FLAT_SCR is just an SGPR pair. 1828 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 1829 return true; 1830 1831 // EXEC register uses the constant bus. 
1832 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 1833 return true; 1834 1835 // SGPRs use the constant bus 1836 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 1837 (!MO.isImplicit() && 1838 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 1839 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 1840 } 1841 1842 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 1843 for (const MachineOperand &MO : MI.implicit_operands()) { 1844 // We only care about reads. 1845 if (MO.isDef()) 1846 continue; 1847 1848 switch (MO.getReg()) { 1849 case AMDGPU::VCC: 1850 case AMDGPU::M0: 1851 case AMDGPU::FLAT_SCR: 1852 return MO.getReg(); 1853 1854 default: 1855 break; 1856 } 1857 } 1858 1859 return AMDGPU::NoRegister; 1860 } 1861 1862 static bool shouldReadExec(const MachineInstr &MI) { 1863 if (SIInstrInfo::isVALU(MI)) { 1864 switch (MI.getOpcode()) { 1865 case AMDGPU::V_READLANE_B32: 1866 case AMDGPU::V_READLANE_B32_si: 1867 case AMDGPU::V_READLANE_B32_vi: 1868 case AMDGPU::V_WRITELANE_B32: 1869 case AMDGPU::V_WRITELANE_B32_si: 1870 case AMDGPU::V_WRITELANE_B32_vi: 1871 return false; 1872 } 1873 1874 return true; 1875 } 1876 1877 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 1878 SIInstrInfo::isSALU(MI) || 1879 SIInstrInfo::isSMRD(MI)) 1880 return false; 1881 1882 return true; 1883 } 1884 1885 static bool isSubRegOf(const SIRegisterInfo &TRI, 1886 const MachineOperand &SuperVec, 1887 const MachineOperand &SubReg) { 1888 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 1889 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 1890 1891 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 1892 SubReg.getReg() == SuperVec.getReg(); 1893 } 1894 1895 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 1896 StringRef &ErrInfo) const { 1897 uint16_t Opcode = MI.getOpcode(); 1898 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 1899 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 1900 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 1901 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 1902 1903 // Make sure the number of operands is correct. 1904 const MCInstrDesc &Desc = get(Opcode); 1905 if (!Desc.isVariadic() && 1906 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 1907 ErrInfo = "Instruction has wrong number of operands."; 1908 return false; 1909 } 1910 1911 // Make sure the register classes are correct. 1912 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 1913 if (MI.getOperand(i).isFPImm()) { 1914 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 1915 "all fp values to integers."; 1916 return false; 1917 } 1918 1919 int RegClass = Desc.OpInfo[i].RegClass; 1920 1921 switch (Desc.OpInfo[i].OperandType) { 1922 case MCOI::OPERAND_REGISTER: 1923 if (MI.getOperand(i).isImm()) { 1924 ErrInfo = "Illegal immediate value for operand."; 1925 return false; 1926 } 1927 break; 1928 case AMDGPU::OPERAND_REG_IMM32_INT: 1929 case AMDGPU::OPERAND_REG_IMM32_FP: 1930 break; 1931 case AMDGPU::OPERAND_REG_INLINE_C_INT: 1932 case AMDGPU::OPERAND_REG_INLINE_C_FP: 1933 if (isLiteralConstant(MI.getOperand(i), 1934 RI.getRegClass(RegClass)->getSize())) { 1935 ErrInfo = "Illegal immediate value for operand."; 1936 return false; 1937 } 1938 break; 1939 case MCOI::OPERAND_IMMEDIATE: 1940 case AMDGPU::OPERAND_KIMM32: 1941 // Check if this operand is an immediate. 
1942 // FrameIndex operands will be replaced by immediates, so they are 1943 // allowed. 1944 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 1945 ErrInfo = "Expected immediate, but got non-immediate"; 1946 return false; 1947 } 1948 LLVM_FALLTHROUGH; 1949 default: 1950 continue; 1951 } 1952 1953 if (!MI.getOperand(i).isReg()) 1954 continue; 1955 1956 if (RegClass != -1) { 1957 unsigned Reg = MI.getOperand(i).getReg(); 1958 if (Reg == AMDGPU::NoRegister || 1959 TargetRegisterInfo::isVirtualRegister(Reg)) 1960 continue; 1961 1962 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 1963 if (!RC->contains(Reg)) { 1964 ErrInfo = "Operand has incorrect register class."; 1965 return false; 1966 } 1967 } 1968 } 1969 1970 // Verify VOP* 1971 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { 1972 // Only look at the true operands. Only a real operand can use the constant 1973 // bus, and we don't want to check pseudo-operands like the source modifier 1974 // flags. 1975 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 1976 1977 unsigned ConstantBusCount = 0; 1978 1979 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 1980 ++ConstantBusCount; 1981 1982 unsigned SGPRUsed = findImplicitSGPRRead(MI); 1983 if (SGPRUsed != AMDGPU::NoRegister) 1984 ++ConstantBusCount; 1985 1986 for (int OpIdx : OpIndices) { 1987 if (OpIdx == -1) 1988 break; 1989 const MachineOperand &MO = MI.getOperand(OpIdx); 1990 if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { 1991 if (MO.isReg()) { 1992 if (MO.getReg() != SGPRUsed) 1993 ++ConstantBusCount; 1994 SGPRUsed = MO.getReg(); 1995 } else { 1996 ++ConstantBusCount; 1997 } 1998 } 1999 } 2000 if (ConstantBusCount > 1) { 2001 ErrInfo = "VOP* instruction uses the constant bus more than once"; 2002 return false; 2003 } 2004 } 2005 2006 // Verify misc. restrictions on specific instructions. 2007 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 2008 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 2009 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2010 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 2011 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 2012 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 2013 if (!compareMachineOp(Src0, Src1) && 2014 !compareMachineOp(Src0, Src2)) { 2015 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 2016 return false; 2017 } 2018 } 2019 } 2020 2021 if (isSOPK(MI)) { 2022 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 2023 if (sopkIsZext(MI)) { 2024 if (!isUInt<16>(Imm)) { 2025 ErrInfo = "invalid immediate for SOPK instruction"; 2026 return false; 2027 } 2028 } else { 2029 if (!isInt<16>(Imm)) { 2030 ErrInfo = "invalid immediate for SOPK instruction"; 2031 return false; 2032 } 2033 } 2034 } 2035 2036 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 2037 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 2038 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2039 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 2040 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2041 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 2042 2043 const unsigned StaticNumOps = Desc.getNumOperands() + 2044 Desc.getNumImplicitUses(); 2045 const unsigned NumImplicitOps = IsDst ? 
2 : 1; 2046 2047 if (MI.getNumOperands() != StaticNumOps + NumImplicitOps) { 2048 ErrInfo = "missing implicit register operands"; 2049 return false; 2050 } 2051 2052 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2053 if (IsDst) { 2054 if (!Dst->isUse()) { 2055 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 2056 return false; 2057 } 2058 2059 unsigned UseOpIdx; 2060 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 2061 UseOpIdx != StaticNumOps + 1) { 2062 ErrInfo = "movrel implicit operands should be tied"; 2063 return false; 2064 } 2065 } 2066 2067 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2068 const MachineOperand &ImpUse 2069 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 2070 if (!ImpUse.isReg() || !ImpUse.isUse() || 2071 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 2072 ErrInfo = "src0 should be subreg of implicit vector use"; 2073 return false; 2074 } 2075 } 2076 2077 // Make sure we aren't losing exec uses in the td files. This mostly requires 2078 // being careful when using let Uses to try to add other use registers. 2079 if (shouldReadExec(MI)) { 2080 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 2081 ErrInfo = "VALU instruction does not implicitly read exec mask"; 2082 return false; 2083 } 2084 } 2085 2086 return true; 2087 } 2088 2089 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 2090 switch (MI.getOpcode()) { 2091 default: return AMDGPU::INSTRUCTION_LIST_END; 2092 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 2093 case AMDGPU::COPY: return AMDGPU::COPY; 2094 case AMDGPU::PHI: return AMDGPU::PHI; 2095 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 2096 case AMDGPU::S_MOV_B32: 2097 return MI.getOperand(1).isReg() ? 2098 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 2099 case AMDGPU::S_ADD_I32: 2100 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 2101 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 2102 case AMDGPU::S_SUB_I32: 2103 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 2104 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 2105 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 2106 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 2107 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 2108 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 2109 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 2110 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 2111 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 2112 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 2113 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 2114 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 2115 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 2116 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 2117 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 2118 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 2119 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 2120 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 2121 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 2122 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 2123 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 2124 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 2125 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 2126 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 2127 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 2128 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 2129 case AMDGPU::S_CMP_GT_I32: 
return AMDGPU::V_CMP_GT_I32_e32; 2130 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 2131 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 2132 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 2133 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 2134 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 2135 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 2136 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 2137 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 2138 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 2139 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 2140 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 2141 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 2142 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 2143 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 2144 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 2145 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 2146 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 2147 } 2148 } 2149 2150 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 2151 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 2152 } 2153 2154 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 2155 unsigned OpNo) const { 2156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2157 const MCInstrDesc &Desc = get(MI.getOpcode()); 2158 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 2159 Desc.OpInfo[OpNo].RegClass == -1) { 2160 unsigned Reg = MI.getOperand(OpNo).getReg(); 2161 2162 if (TargetRegisterInfo::isVirtualRegister(Reg)) 2163 return MRI.getRegClass(Reg); 2164 return RI.getPhysRegClass(Reg); 2165 } 2166 2167 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 2168 return RI.getRegClass(RCID); 2169 } 2170 2171 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 2172 switch (MI.getOpcode()) { 2173 case AMDGPU::COPY: 2174 case AMDGPU::REG_SEQUENCE: 2175 case AMDGPU::PHI: 2176 case AMDGPU::INSERT_SUBREG: 2177 return RI.hasVGPRs(getOpRegClass(MI, 0)); 2178 default: 2179 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 2180 } 2181 } 2182 2183 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 2184 MachineBasicBlock::iterator I = MI; 2185 MachineBasicBlock *MBB = MI.getParent(); 2186 MachineOperand &MO = MI.getOperand(OpIdx); 2187 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2188 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 2189 const TargetRegisterClass *RC = RI.getRegClass(RCID); 2190 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 2191 if (MO.isReg()) 2192 Opcode = AMDGPU::COPY; 2193 else if (RI.isSGPRClass(RC)) 2194 Opcode = AMDGPU::S_MOV_B32; 2195 2196 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 2197 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 2198 VRC = &AMDGPU::VReg_64RegClass; 2199 else 2200 VRC = &AMDGPU::VGPR_32RegClass; 2201 2202 unsigned Reg = MRI.createVirtualRegister(VRC); 2203 DebugLoc DL = MBB->findDebugLoc(I); 2204 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); 2205 MO.ChangeToRegister(Reg, false); 2206 } 2207 2208 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 2209 MachineRegisterInfo &MRI, 2210 MachineOperand &SuperReg, 2211 const TargetRegisterClass *SuperRC, 2212 unsigned SubIdx, 2213 const TargetRegisterClass *SubRC) 2214 
const { 2215 MachineBasicBlock *MBB = MI->getParent(); 2216 DebugLoc DL = MI->getDebugLoc(); 2217 unsigned SubReg = MRI.createVirtualRegister(SubRC); 2218 2219 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 2220 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2221 .addReg(SuperReg.getReg(), 0, SubIdx); 2222 return SubReg; 2223 } 2224 2225 // Just in case the super register is itself a sub-register, copy it to a new 2226 // value so we don't need to worry about merging its subreg index with the 2227 // SubIdx passed to this function. The register coalescer should be able to 2228 // eliminate this extra copy. 2229 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 2230 2231 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 2232 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 2233 2234 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2235 .addReg(NewSuperReg, 0, SubIdx); 2236 2237 return SubReg; 2238 } 2239 2240 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 2241 MachineBasicBlock::iterator MII, 2242 MachineRegisterInfo &MRI, 2243 MachineOperand &Op, 2244 const TargetRegisterClass *SuperRC, 2245 unsigned SubIdx, 2246 const TargetRegisterClass *SubRC) const { 2247 if (Op.isImm()) { 2248 if (SubIdx == AMDGPU::sub0) 2249 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 2250 if (SubIdx == AMDGPU::sub1) 2251 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 2252 2253 llvm_unreachable("Unhandled register index for immediate"); 2254 } 2255 2256 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 2257 SubIdx, SubRC); 2258 return MachineOperand::CreateReg(SubReg, false); 2259 } 2260 2261 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 2262 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 2263 assert(Inst.getNumExplicitOperands() == 3); 2264 MachineOperand Op1 = Inst.getOperand(1); 2265 Inst.RemoveOperand(1); 2266 Inst.addOperand(Op1); 2267 } 2268 2269 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 2270 const MCOperandInfo &OpInfo, 2271 const MachineOperand &MO) const { 2272 if (!MO.isReg()) 2273 return false; 2274 2275 unsigned Reg = MO.getReg(); 2276 const TargetRegisterClass *RC = 2277 TargetRegisterInfo::isVirtualRegister(Reg) ? 2278 MRI.getRegClass(Reg) : 2279 RI.getPhysRegClass(Reg); 2280 2281 const SIRegisterInfo *TRI = 2282 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 2283 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 2284 2285 // In order to be legal, the common sub-class must be equal to the 2286 // class of the current operand. For example: 2287 // 2288 // v_mov_b32 s0 ; Operand defined as vsrc_b32 2289 // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL 2290 // 2291 // s_sendmsg 0, s0 ; Operand defined as m0reg 2292 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 2293 2294 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 2295 } 2296 2297 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 2298 const MCOperandInfo &OpInfo, 2299 const MachineOperand &MO) const { 2300 if (MO.isReg()) 2301 return isLegalRegOperand(MRI, OpInfo, MO); 2302 2303 // Handle non-register types that are treated like immediates. 
2304 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2305 return true; 2306 } 2307 2308 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 2309 const MachineOperand *MO) const { 2310 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2311 const MCInstrDesc &InstDesc = MI.getDesc(); 2312 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 2313 const TargetRegisterClass *DefinedRC = 2314 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 2315 if (!MO) 2316 MO = &MI.getOperand(OpIdx); 2317 2318 if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { 2319 2320 RegSubRegPair SGPRUsed; 2321 if (MO->isReg()) 2322 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 2323 2324 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2325 if (i == OpIdx) 2326 continue; 2327 const MachineOperand &Op = MI.getOperand(i); 2328 if (Op.isReg()) { 2329 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 2330 usesConstantBus(MRI, Op, getOpSize(MI, i))) { 2331 return false; 2332 } 2333 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 2334 return false; 2335 } 2336 } 2337 } 2338 2339 if (MO->isReg()) { 2340 assert(DefinedRC); 2341 return isLegalRegOperand(MRI, OpInfo, *MO); 2342 } 2343 2344 // Handle non-register types that are treated like immediates. 2345 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 2346 2347 if (!DefinedRC) { 2348 // This operand expects an immediate. 2349 return true; 2350 } 2351 2352 return isImmOperandLegal(MI, OpIdx, *MO); 2353 } 2354 2355 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 2356 MachineInstr &MI) const { 2357 unsigned Opc = MI.getOpcode(); 2358 const MCInstrDesc &InstrDesc = get(Opc); 2359 2360 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2361 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2362 2363 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 2364 // we need to only have one constant bus use. 2365 // 2366 // Note we do not need to worry about literal constants here. They are 2367 // disabled for the operand type for instructions because they will always 2368 // violate the one constant bus use rule. 2369 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 2370 if (HasImplicitSGPR) { 2371 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2372 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2373 2374 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 2375 legalizeOpWithMove(MI, Src0Idx); 2376 } 2377 2378 // VOP2 src0 instructions support all operand types, so we don't need to check 2379 // their legality. If src1 is already legal, we don't need to do anything. 2380 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 2381 return; 2382 2383 // We do not use commuteInstruction here because it is too aggressive and will 2384 // commute if it is possible. We only want to commute here if it improves 2385 // legality. This can be called a fairly large number of times so don't waste 2386 // compile time pointlessly swapping and checking legality again. 2387 if (HasImplicitSGPR || !MI.isCommutable()) { 2388 legalizeOpWithMove(MI, Src1Idx); 2389 return; 2390 } 2391 2392 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2393 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2394 2395 // If src0 can be used as src1, commuting will make the operands legal. 
2396 // Otherwise we have to give up and insert a move. 2397 // 2398 // TODO: Other immediate-like operand kinds could be commuted if there was a 2399 // MachineOperand::ChangeTo* for them. 2400 if ((!Src1.isImm() && !Src1.isReg()) || 2401 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 2402 legalizeOpWithMove(MI, Src1Idx); 2403 return; 2404 } 2405 2406 int CommutedOpc = commuteOpcode(MI); 2407 if (CommutedOpc == -1) { 2408 legalizeOpWithMove(MI, Src1Idx); 2409 return; 2410 } 2411 2412 MI.setDesc(get(CommutedOpc)); 2413 2414 unsigned Src0Reg = Src0.getReg(); 2415 unsigned Src0SubReg = Src0.getSubReg(); 2416 bool Src0Kill = Src0.isKill(); 2417 2418 if (Src1.isImm()) 2419 Src0.ChangeToImmediate(Src1.getImm()); 2420 else if (Src1.isReg()) { 2421 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 2422 Src0.setSubReg(Src1.getSubReg()); 2423 } else 2424 llvm_unreachable("Should only have register or immediate operands"); 2425 2426 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 2427 Src1.setSubReg(Src0SubReg); 2428 } 2429 2430 // Legalize VOP3 operands. Because all operand types are supported for any 2431 // operand, and since literal constants are not allowed and should never be 2432 // seen, we only need to worry about inserting copies if we use multiple SGPR 2433 // operands. 2434 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 2435 MachineInstr &MI) const { 2436 unsigned Opc = MI.getOpcode(); 2437 2438 int VOP3Idx[3] = { 2439 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 2440 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 2441 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 2442 }; 2443 2444 // Find the one SGPR operand we are allowed to use. 2445 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 2446 2447 for (unsigned i = 0; i < 3; ++i) { 2448 int Idx = VOP3Idx[i]; 2449 if (Idx == -1) 2450 break; 2451 MachineOperand &MO = MI.getOperand(Idx); 2452 2453 // We should never see a VOP3 instruction with an illegal immediate operand. 2454 if (!MO.isReg()) 2455 continue; 2456 2457 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2458 continue; // VGPRs are legal 2459 2460 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 2461 SGPRReg = MO.getReg(); 2462 // We can use one SGPR in each VOP3 instruction. 2463 continue; 2464 } 2465 2466 // If we make it this far, then the operand is not legal and we must 2467 // legalize it. 
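// Copying the extra SGPR into a VGPR keeps the instruction to a single
// constant bus read.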
2468 legalizeOpWithMove(MI, Idx);
2469 }
2470 }
2471
2472 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
2473 MachineRegisterInfo &MRI) const {
2474 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2475 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2476 unsigned DstReg = MRI.createVirtualRegister(SRC);
2477 unsigned SubRegs = VRC->getSize() / 4;
2478
2479 SmallVector<unsigned, 8> SRegs;
2480 for (unsigned i = 0; i < SubRegs; ++i) {
2481 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2482 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2483 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2484 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2485 SRegs.push_back(SGPR);
2486 }
2487
2488 MachineInstrBuilder MIB =
2489 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2490 get(AMDGPU::REG_SEQUENCE), DstReg);
2491 for (unsigned i = 0; i < SubRegs; ++i) {
2492 MIB.addReg(SRegs[i]);
2493 MIB.addImm(RI.getSubRegFromChannel(i));
2494 }
2495 return DstReg;
2496 }
2497
2498 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2499 MachineInstr &MI) const {
2500
2501 // If the pointer is stored in VGPRs, then we need to move it to
2502 // SGPRs using v_readfirstlane. This is safe because we only select
2503 // loads with uniform pointers to SMRD instructions, so we know the
2504 // pointer value is uniform.
2505 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
2506 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2507 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2508 SBase->setReg(SGPR);
2509 }
2510 }
2511
2512 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
2513 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2514
2515 // Legalize VOP2
2516 if (isVOP2(MI) || isVOPC(MI)) {
2517 legalizeOperandsVOP2(MRI, MI);
2518 return;
2519 }
2520
2521 // Legalize VOP3
2522 if (isVOP3(MI)) {
2523 legalizeOperandsVOP3(MRI, MI);
2524 return;
2525 }
2526
2527 // Legalize SMRD
2528 if (isSMRD(MI)) {
2529 legalizeOperandsSMRD(MRI, MI);
2530 return;
2531 }
2532
2533 // Legalize REG_SEQUENCE and PHI
2534 // The register class of the operands must be the same type as the register
2535 // class of the output.
2536 if (MI.getOpcode() == AMDGPU::PHI) {
2537 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2538 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
2539 if (!MI.getOperand(i).isReg() ||
2540 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
2541 continue;
2542 const TargetRegisterClass *OpRC =
2543 MRI.getRegClass(MI.getOperand(i).getReg());
2544 if (RI.hasVGPRs(OpRC)) {
2545 VRC = OpRC;
2546 } else {
2547 SRC = OpRC;
2548 }
2549 }
2550
2551 // If any of the operands are VGPR registers, then they all must be VGPRs,
2552 // otherwise we will create illegal VGPR->SGPR copies when legalizing
2553 // them.
2554 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
2555 if (!VRC) {
2556 assert(SRC);
2557 VRC = RI.getEquivalentVGPRClass(SRC);
2558 }
2559 RC = VRC;
2560 } else {
2561 RC = SRC;
2562 }
2563
2564 // Update all the operands so they have the same type.
2565 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2566 MachineOperand &Op = MI.getOperand(I);
2567 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2568 continue;
2569 unsigned DstReg = MRI.createVirtualRegister(RC);
2570
2571 // MI is a PHI instruction.
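// The incoming value is only available at the end of the corresponding
// predecessor block, so the copy is inserted there (before the block's
// terminator) rather than next to the PHI itself.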
2572 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
2573 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2574
2575 BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
2576 .addOperand(Op);
2577 Op.setReg(DstReg);
2578 }
2579 }
2580
2581 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2582 // VGPR dest type and SGPR sources, insert copies so all operands are
2583 // VGPRs. This seems to help operand folding / the register coalescer.
2584 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
2585 MachineBasicBlock *MBB = MI.getParent();
2586 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
2587 if (RI.hasVGPRs(DstRC)) {
2588 // Update all the operands so they are VGPR register classes. These may
2589 // not be the same register class because REG_SEQUENCE supports mixing
2590 // subregister index types e.g. sub0_sub1 + sub2 + sub3
2591 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2592 MachineOperand &Op = MI.getOperand(I);
2593 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2594 continue;
2595
2596 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2597 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2598 if (VRC == OpRC)
2599 continue;
2600
2601 unsigned DstReg = MRI.createVirtualRegister(VRC);
2602
2603 BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
2604 .addOperand(Op);
2605
2606 Op.setReg(DstReg);
2607 Op.setIsKill();
2608 }
2609 }
2610
2611 return;
2612 }
2613
2614 // Legalize INSERT_SUBREG
2615 // src0 must have the same register class as dst
2616 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
2617 unsigned Dst = MI.getOperand(0).getReg();
2618 unsigned Src0 = MI.getOperand(1).getReg();
2619 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2620 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2621 if (DstRC != Src0RC) {
2622 MachineBasicBlock &MBB = *MI.getParent();
2623 unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
2624 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
2625 .addReg(Src0);
2626 MI.getOperand(1).setReg(NewSrc0);
2627 }
2628 return;
2629 }
2630
2631 // Legalize MIMG
2632 if (isMIMG(MI)) {
2633 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
2634 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2635 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2636 SRsrc->setReg(SGPR);
2637 }
2638
2639 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
2640 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2641 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2642 SSamp->setReg(SGPR);
2643 }
2644 return;
2645 }
2646
2647 // Legalize MUBUF* instructions
2648 // FIXME: If we start using the non-addr64 instructions for compute, we
2649 // may need to legalize them here.
2650 int SRsrcIdx =
2651 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
2652 if (SRsrcIdx != -1) {
2653 // We have an MUBUF instruction
2654 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
2655 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
2656 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2657 RI.getRegClass(SRsrcRC))) {
2658 // The operands are legal.
2659 // FIXME: We may need to legalize operands besides srsrc.
2660 return;
2661 }
2662
2663 MachineBasicBlock &MBB = *MI.getParent();
2664
2665 // Extract the ptr from the resource descriptor.
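// The 64-bit base pointer occupies the first two dwords (sub0_sub1) of the
// 128-bit resource descriptor; it is the value the ADDR64 form adds to vaddr.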
2666 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2667 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2668
2669 // Create an empty resource descriptor
2670 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2671 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2672 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2673 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2674 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2675
2676 // Zero64 = 0
2677 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
2678 .addImm(0);
2679
2680 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2681 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
2682 .addImm(RsrcDataFormat & 0xFFFFFFFF);
2683
2684 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2685 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
2686 .addImm(RsrcDataFormat >> 32);
2687
2688 // NewSRsrc = {Zero64, SRsrcFormat}
2689 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2690 .addReg(Zero64)
2691 .addImm(AMDGPU::sub0_sub1)
2692 .addReg(SRsrcFormatLo)
2693 .addImm(AMDGPU::sub2)
2694 .addReg(SRsrcFormatHi)
2695 .addImm(AMDGPU::sub3);
2696
2697 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
2698 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2699 if (VAddr) {
2700 // This is already an ADDR64 instruction so we need to add the pointer
2701 // extracted from the resource descriptor to the current value of VAddr.
2702 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2703 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2704
2705 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2706 DebugLoc DL = MI.getDebugLoc();
2707 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2708 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2709 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2710
2711 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2712 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2713 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2714 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2715
2716 // NewVaddr = {NewVaddrHi, NewVaddrLo}
2717 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2718 .addReg(NewVAddrLo)
2719 .addImm(AMDGPU::sub0)
2720 .addReg(NewVAddrHi)
2721 .addImm(AMDGPU::sub1);
2722 } else {
2723 // This instruction is the _OFFSET variant, so we need to convert it to
2724 // ADDR64.
2725 assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
2726 < SISubtarget::VOLCANIC_ISLANDS &&
2727 "FIXME: Need to emit flat atomics here");
2728
2729 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
2730 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2731 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
2732 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
2733
2734 // Atomics with return have an additional tied operand and are
2735 // missing some of the special bits.
2736 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
2737 MachineInstr *Addr64;
2738
2739 if (!VDataIn) {
2740 // Regular buffer load / store.
2741 MachineInstrBuilder MIB =
2742 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
2743 .addOperand(*VData)
2744 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2745 // This will be replaced later 2746 // with the new value of vaddr. 2747 .addOperand(*SRsrc) 2748 .addOperand(*SOffset) 2749 .addOperand(*Offset); 2750 2751 // Atomics do not have this operand. 2752 if (const MachineOperand *GLC = 2753 getNamedOperand(MI, AMDGPU::OpName::glc)) { 2754 MIB.addImm(GLC->getImm()); 2755 } 2756 2757 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 2758 2759 if (const MachineOperand *TFE = 2760 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 2761 MIB.addImm(TFE->getImm()); 2762 } 2763 2764 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 2765 Addr64 = MIB; 2766 } else { 2767 // Atomics with return. 2768 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 2769 .addOperand(*VData) 2770 .addOperand(*VDataIn) 2771 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2772 // This will be replaced later 2773 // with the new value of vaddr. 2774 .addOperand(*SRsrc) 2775 .addOperand(*SOffset) 2776 .addOperand(*Offset) 2777 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 2778 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 2779 } 2780 2781 MI.removeFromParent(); 2782 2783 // NewVaddr = {NewVaddrHi, NewVaddrLo} 2784 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 2785 NewVAddr) 2786 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 2787 .addImm(AMDGPU::sub0) 2788 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 2789 .addImm(AMDGPU::sub1); 2790 2791 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); 2792 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); 2793 } 2794 2795 // Update the instruction to use NewVaddr 2796 VAddr->setReg(NewVAddr); 2797 // Update the instruction to use NewSRsrc 2798 SRsrc->setReg(NewSRsrc); 2799 } 2800 } 2801 2802 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2803 SmallVector<MachineInstr *, 128> Worklist; 2804 Worklist.push_back(&TopInst); 2805 2806 while (!Worklist.empty()) { 2807 MachineInstr &Inst = *Worklist.pop_back_val(); 2808 MachineBasicBlock *MBB = Inst.getParent(); 2809 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2810 2811 unsigned Opcode = Inst.getOpcode(); 2812 unsigned NewOpcode = getVALUOp(Inst); 2813 2814 // Handle some special cases 2815 switch (Opcode) { 2816 default: 2817 break; 2818 case AMDGPU::S_AND_B64: 2819 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 2820 Inst.eraseFromParent(); 2821 continue; 2822 2823 case AMDGPU::S_OR_B64: 2824 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 2825 Inst.eraseFromParent(); 2826 continue; 2827 2828 case AMDGPU::S_XOR_B64: 2829 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 2830 Inst.eraseFromParent(); 2831 continue; 2832 2833 case AMDGPU::S_NOT_B64: 2834 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 2835 Inst.eraseFromParent(); 2836 continue; 2837 2838 case AMDGPU::S_BCNT1_I32_B64: 2839 splitScalar64BitBCNT(Worklist, Inst); 2840 Inst.eraseFromParent(); 2841 continue; 2842 2843 case AMDGPU::S_BFE_I64: { 2844 splitScalar64BitBFE(Worklist, Inst); 2845 Inst.eraseFromParent(); 2846 continue; 2847 } 2848 2849 case AMDGPU::S_LSHL_B32: 2850 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2851 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2852 swapOperands(Inst); 2853 } 2854 break; 2855 case AMDGPU::S_ASHR_I32: 2856 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2857 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2858 swapOperands(Inst); 2859 } 2860 break; 2861 case AMDGPU::S_LSHR_B32: 2862 if (ST.getGeneration() >= 
SISubtarget::VOLCANIC_ISLANDS) { 2863 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 2864 swapOperands(Inst); 2865 } 2866 break; 2867 case AMDGPU::S_LSHL_B64: 2868 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2869 NewOpcode = AMDGPU::V_LSHLREV_B64; 2870 swapOperands(Inst); 2871 } 2872 break; 2873 case AMDGPU::S_ASHR_I64: 2874 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2875 NewOpcode = AMDGPU::V_ASHRREV_I64; 2876 swapOperands(Inst); 2877 } 2878 break; 2879 case AMDGPU::S_LSHR_B64: 2880 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2881 NewOpcode = AMDGPU::V_LSHRREV_B64; 2882 swapOperands(Inst); 2883 } 2884 break; 2885 2886 case AMDGPU::S_ABS_I32: 2887 lowerScalarAbs(Worklist, Inst); 2888 Inst.eraseFromParent(); 2889 continue; 2890 2891 case AMDGPU::S_CBRANCH_SCC0: 2892 case AMDGPU::S_CBRANCH_SCC1: 2893 // Clear unused bits of vcc 2894 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 2895 AMDGPU::VCC) 2896 .addReg(AMDGPU::EXEC) 2897 .addReg(AMDGPU::VCC); 2898 break; 2899 2900 case AMDGPU::S_BFE_U64: 2901 case AMDGPU::S_BFM_B64: 2902 llvm_unreachable("Moving this op to VALU not implemented"); 2903 } 2904 2905 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 2906 // We cannot move this instruction to the VALU, so we should try to 2907 // legalize its operands instead. 2908 legalizeOperands(Inst); 2909 continue; 2910 } 2911 2912 // Use the new VALU Opcode. 2913 const MCInstrDesc &NewDesc = get(NewOpcode); 2914 Inst.setDesc(NewDesc); 2915 2916 // Remove any references to SCC. Vector instructions can't read from it, and 2917 // We're just about to add the implicit use / defs of VCC, and we don't want 2918 // both. 2919 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 2920 MachineOperand &Op = Inst.getOperand(i); 2921 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 2922 Inst.RemoveOperand(i); 2923 addSCCDefUsersToVALUWorklist(Inst, Worklist); 2924 } 2925 } 2926 2927 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2928 // We are converting these to a BFE, so we need to add the missing 2929 // operands for the size and offset. 2930 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2931 Inst.addOperand(MachineOperand::CreateImm(0)); 2932 Inst.addOperand(MachineOperand::CreateImm(Size)); 2933 2934 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2935 // The VALU version adds the second operand to the result, so insert an 2936 // extra 0 operand. 2937 Inst.addOperand(MachineOperand::CreateImm(0)); 2938 } 2939 2940 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 2941 2942 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2943 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 2944 // If we need to move this to VGPRs, we need to unpack the second operand 2945 // back into the 2 separate ones for bit offset and width. 2946 assert(OffsetWidthOp.isImm() && 2947 "Scalar BFE is only implemented for constant width and offset"); 2948 uint32_t Imm = OffsetWidthOp.getImm(); 2949 2950 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2951 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2952 Inst.RemoveOperand(2); // Remove old immediate. 2953 Inst.addOperand(MachineOperand::CreateImm(Offset)); 2954 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 2955 } 2956 2957 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 2958 unsigned NewDstReg = AMDGPU::NoRegister; 2959 if (HasDst) { 2960 // Update the destination register class. 
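// The SALU version defined an SGPR; the VALU replacement must define a VGPR,
// so create a register of the equivalent VGPR class and rewrite every use of
// the old result.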
2961 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 2962 if (!NewDstRC) 2963 continue; 2964 2965 unsigned DstReg = Inst.getOperand(0).getReg(); 2966 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2967 MRI.replaceRegWith(DstReg, NewDstReg); 2968 } 2969 2970 // Legalize the operands 2971 legalizeOperands(Inst); 2972 2973 if (HasDst) 2974 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 2975 } 2976 } 2977 2978 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 2979 MachineInstr &Inst) const { 2980 MachineBasicBlock &MBB = *Inst.getParent(); 2981 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2982 MachineBasicBlock::iterator MII = Inst; 2983 DebugLoc DL = Inst.getDebugLoc(); 2984 2985 MachineOperand &Dest = Inst.getOperand(0); 2986 MachineOperand &Src = Inst.getOperand(1); 2987 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2988 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2989 2990 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 2991 .addImm(0) 2992 .addReg(Src.getReg()); 2993 2994 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 2995 .addReg(Src.getReg()) 2996 .addReg(TmpReg); 2997 2998 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2999 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3000 } 3001 3002 void SIInstrInfo::splitScalar64BitUnaryOp( 3003 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3004 unsigned Opcode) const { 3005 MachineBasicBlock &MBB = *Inst.getParent(); 3006 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3007 3008 MachineOperand &Dest = Inst.getOperand(0); 3009 MachineOperand &Src0 = Inst.getOperand(1); 3010 DebugLoc DL = Inst.getDebugLoc(); 3011 3012 MachineBasicBlock::iterator MII = Inst; 3013 3014 const MCInstrDesc &InstDesc = get(Opcode); 3015 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3016 MRI.getRegClass(Src0.getReg()) : 3017 &AMDGPU::SGPR_32RegClass; 3018 3019 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3020 3021 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3022 AMDGPU::sub0, Src0SubRC); 3023 3024 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3025 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3026 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3027 3028 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3029 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3030 .addOperand(SrcReg0Sub0); 3031 3032 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3033 AMDGPU::sub1, Src0SubRC); 3034 3035 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3036 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3037 .addOperand(SrcReg0Sub1); 3038 3039 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3040 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3041 .addReg(DestSub0) 3042 .addImm(AMDGPU::sub0) 3043 .addReg(DestSub1) 3044 .addImm(AMDGPU::sub1); 3045 3046 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3047 3048 // We don't need to legalizeOperands here because for a single operand, src0 3049 // will support any kind of input. 3050 3051 // Move all users of this moved value. 
3052 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3053 }
3054
3055 void SIInstrInfo::splitScalar64BitBinaryOp(
3056 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3057 unsigned Opcode) const {
3058 MachineBasicBlock &MBB = *Inst.getParent();
3059 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3060
3061 MachineOperand &Dest = Inst.getOperand(0);
3062 MachineOperand &Src0 = Inst.getOperand(1);
3063 MachineOperand &Src1 = Inst.getOperand(2);
3064 DebugLoc DL = Inst.getDebugLoc();
3065
3066 MachineBasicBlock::iterator MII = Inst;
3067
3068 const MCInstrDesc &InstDesc = get(Opcode);
3069 const TargetRegisterClass *Src0RC = Src0.isReg() ?
3070 MRI.getRegClass(Src0.getReg()) :
3071 &AMDGPU::SGPR_32RegClass;
3072
3073 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3074 const TargetRegisterClass *Src1RC = Src1.isReg() ?
3075 MRI.getRegClass(Src1.getReg()) :
3076 &AMDGPU::SGPR_32RegClass;
3077
3078 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3079
3080 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3081 AMDGPU::sub0, Src0SubRC);
3082 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3083 AMDGPU::sub0, Src1SubRC);
3084
3085 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3086 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3087 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3088
3089 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3090 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3091 .addOperand(SrcReg0Sub0)
3092 .addOperand(SrcReg1Sub0);
3093
3094 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3095 AMDGPU::sub1, Src0SubRC);
3096 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3097 AMDGPU::sub1, Src1SubRC);
3098
3099 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3100 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3101 .addOperand(SrcReg0Sub1)
3102 .addOperand(SrcReg1Sub1);
3103
3104 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3105 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3106 .addReg(DestSub0)
3107 .addImm(AMDGPU::sub0)
3108 .addReg(DestSub1)
3109 .addImm(AMDGPU::sub1);
3110
3111 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3112
3113 // Try to legalize the operands in case we need to swap the order to keep it
3114 // valid.
3115 legalizeOperands(LoHalf);
3116 legalizeOperands(HiHalf);
3117
3118 // Move all users of this moved value.
3119 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3120 }
3121
3122 void SIInstrInfo::splitScalar64BitBCNT(
3123 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
3124 MachineBasicBlock &MBB = *Inst.getParent();
3125 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3126
3127 MachineBasicBlock::iterator MII = Inst;
3128 DebugLoc DL = Inst.getDebugLoc();
3129
3130 MachineOperand &Dest = Inst.getOperand(0);
3131 MachineOperand &Src = Inst.getOperand(1);
3132
3133 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3134 const TargetRegisterClass *SrcRC = Src.isReg() ?
3135 MRI.getRegClass(Src.getReg()) :
3136 &AMDGPU::SGPR_32RegClass;
3137
3138 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3139 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3140
3141 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3142
3143 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3144 AMDGPU::sub0, SrcSubRC);
3145 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3146 AMDGPU::sub1, SrcSubRC);
3147
3148 BuildMI(MBB, MII, DL, InstDesc, MidReg)
3149 .addOperand(SrcRegSub0)
3150 .addImm(0);
3151
3152 BuildMI(MBB, MII, DL, InstDesc, ResultReg)
3153 .addOperand(SrcRegSub1)
3154 .addReg(MidReg);
3155
3156 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3157
3158 // We don't need to legalize operands here. src0 for either instruction can be
3159 // an SGPR, and the second input is unused or determined here.
3160 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3161 }
3162
3163 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
3164 MachineInstr &Inst) const {
3165 MachineBasicBlock &MBB = *Inst.getParent();
3166 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3167 MachineBasicBlock::iterator MII = Inst;
3168 DebugLoc DL = Inst.getDebugLoc();
3169
3170 MachineOperand &Dest = Inst.getOperand(0);
3171 uint32_t Imm = Inst.getOperand(2).getImm();
3172 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3173 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3174
3175 (void) Offset;
3176
3177 // Only sext_inreg cases handled.
3178 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3179 Offset == 0 && "Not implemented");
3180
3181 if (BitWidth < 32) {
3182 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3183 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3184 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3185
3186 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3187 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3188 .addImm(0)
3189 .addImm(BitWidth);
3190
3191 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3192 .addImm(31)
3193 .addReg(MidRegLo);
3194
3195 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3196 .addReg(MidRegLo)
3197 .addImm(AMDGPU::sub0)
3198 .addReg(MidRegHi)
3199 .addImm(AMDGPU::sub1);
3200
3201 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3202 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3203 return;
3204 }
3205
3206 MachineOperand &Src = Inst.getOperand(1);
3207 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3208 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3209
3210 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3211 .addImm(31)
3212 .addReg(Src.getReg(), 0, AMDGPU::sub0);
3213
3214 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3215 .addReg(Src.getReg(), 0, AMDGPU::sub0)
3216 .addImm(AMDGPU::sub0)
3217 .addReg(TmpReg)
3218 .addImm(AMDGPU::sub1);
3219
3220 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3221 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3222 }
3223
3224 void SIInstrInfo::addUsersToMoveToVALUWorklist(
3225 unsigned DstReg,
3226 MachineRegisterInfo &MRI,
3227 SmallVectorImpl<MachineInstr *> &Worklist) const {
3228 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3229 E = MRI.use_end(); I != E; ++I) {
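// A user that cannot accept a VGPR in the operand position that reads DstReg
// must itself be moved to the VALU, so queue it on the worklist.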
3230 MachineInstr &UseMI = *I->getParent(); 3231 if (!canReadVGPR(UseMI, I.getOperandNo())) { 3232 Worklist.push_back(&UseMI); 3233 } 3234 } 3235 } 3236 3237 void SIInstrInfo::addSCCDefUsersToVALUWorklist( 3238 MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { 3239 // This assumes that all the users of SCC are in the same block 3240 // as the SCC def. 3241 for (MachineInstr &MI : 3242 llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), 3243 SCCDefInst.getParent()->end())) { 3244 // Exit if we find another SCC def. 3245 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 3246 return; 3247 3248 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 3249 Worklist.push_back(&MI); 3250 } 3251 } 3252 3253 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 3254 const MachineInstr &Inst) const { 3255 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 3256 3257 switch (Inst.getOpcode()) { 3258 // For target instructions, getOpRegClass just returns the virtual register 3259 // class associated with the operand, so we need to find an equivalent VGPR 3260 // register class in order to move the instruction to the VALU. 3261 case AMDGPU::COPY: 3262 case AMDGPU::PHI: 3263 case AMDGPU::REG_SEQUENCE: 3264 case AMDGPU::INSERT_SUBREG: 3265 if (RI.hasVGPRs(NewDstRC)) 3266 return nullptr; 3267 3268 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 3269 if (!NewDstRC) 3270 return nullptr; 3271 return NewDstRC; 3272 default: 3273 return NewDstRC; 3274 } 3275 } 3276 3277 // Find the one SGPR operand we are allowed to use. 3278 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 3279 int OpIndices[3]) const { 3280 const MCInstrDesc &Desc = MI.getDesc(); 3281 3282 // Find the one SGPR operand we are allowed to use. 3283 // 3284 // First we need to consider the instruction's operand requirements before 3285 // legalizing. Some operands are required to be SGPRs, such as implicit uses 3286 // of VCC, but we are still bound by the constant bus requirement to only use 3287 // one. 3288 // 3289 // If the operand's class is an SGPR, we can never move it. 3290 3291 unsigned SGPRReg = findImplicitSGPRRead(MI); 3292 if (SGPRReg != AMDGPU::NoRegister) 3293 return SGPRReg; 3294 3295 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 3296 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3297 3298 for (unsigned i = 0; i < 3; ++i) { 3299 int Idx = OpIndices[i]; 3300 if (Idx == -1) 3301 break; 3302 3303 const MachineOperand &MO = MI.getOperand(Idx); 3304 if (!MO.isReg()) 3305 continue; 3306 3307 // Is this operand statically required to be an SGPR based on the operand 3308 // constraints? 3309 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 3310 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 3311 if (IsRequiredSGPR) 3312 return MO.getReg(); 3313 3314 // If this could be a VGPR or an SGPR, Check the dynamic register class. 3315 unsigned Reg = MO.getReg(); 3316 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 3317 if (RI.isSGPRClass(RegRC)) 3318 UsedSGPRs[i] = Reg; 3319 } 3320 3321 // We don't have a required SGPR operand, so we have a bit more freedom in 3322 // selecting operands to move. 3323 3324 // Try to select the most used SGPR. If an SGPR is equal to one of the 3325 // others, we choose that. 3326 // 3327 // e.g. 3328 // V_FMA_F32 v0, s0, s0, s0 -> No moves 3329 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 3330 3331 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 3332 // prefer those. 
3333 3334 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 3335 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 3336 SGPRReg = UsedSGPRs[0]; 3337 } 3338 3339 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 3340 if (UsedSGPRs[1] == UsedSGPRs[2]) 3341 SGPRReg = UsedSGPRs[1]; 3342 } 3343 3344 return SGPRReg; 3345 } 3346 3347 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 3348 unsigned OperandName) const { 3349 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 3350 if (Idx == -1) 3351 return nullptr; 3352 3353 return &MI.getOperand(Idx); 3354 } 3355 3356 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 3357 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 3358 if (ST.isAmdHsaOS()) { 3359 RsrcDataFormat |= (1ULL << 56); 3360 3361 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3362 // Set MTYPE = 2 3363 RsrcDataFormat |= (2ULL << 59); 3364 } 3365 3366 return RsrcDataFormat; 3367 } 3368 3369 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 3370 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 3371 AMDGPU::RSRC_TID_ENABLE | 3372 0xffffffff; // Size; 3373 3374 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 3375 3376 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | 3377 // IndexStride = 64 3378 (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); 3379 3380 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 3381 // Clear them unless we want a huge stride. 3382 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3383 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 3384 3385 return Rsrc23; 3386 } 3387 3388 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 3389 unsigned Opc = MI.getOpcode(); 3390 3391 return isSMRD(Opc); 3392 } 3393 3394 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { 3395 unsigned Opc = MI.getOpcode(); 3396 3397 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 3398 } 3399 3400 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 3401 int &FrameIndex) const { 3402 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 3403 if (!Addr || !Addr->isFI()) 3404 return AMDGPU::NoRegister; 3405 3406 assert(!MI.memoperands_empty() && 3407 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 3408 3409 FrameIndex = Addr->getIndex(); 3410 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 3411 } 3412 3413 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 3414 int &FrameIndex) const { 3415 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 3416 assert(Addr && Addr->isFI()); 3417 FrameIndex = Addr->getIndex(); 3418 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 3419 } 3420 3421 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 3422 int &FrameIndex) const { 3423 3424 if (!MI.mayLoad()) 3425 return AMDGPU::NoRegister; 3426 3427 if (isMUBUF(MI) || isVGPRSpill(MI)) 3428 return isStackAccess(MI, FrameIndex); 3429 3430 if (isSGPRSpill(MI)) 3431 return isSGPRStackAccess(MI, FrameIndex); 3432 3433 return AMDGPU::NoRegister; 3434 } 3435 3436 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 3437 int &FrameIndex) const { 3438 if (!MI.mayStore()) 3439 return AMDGPU::NoRegister; 3440 3441 if (isMUBUF(MI) || isVGPRSpill(MI)) 3442 return isStackAccess(MI, FrameIndex); 3443 3444 if (isSGPRSpill(MI)) 3445 return isSGPRStackAccess(MI, FrameIndex); 3446 3447 return AMDGPU::NoRegister; 3448 } 3449 
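/// Return the number of bytes \p MI occupies in the instruction stream. When
/// the MCInstrDesc does not report a fixed size, a VALU/SALU encoding grows
/// from 4 to 8 bytes if one of its source operands needs a 32-bit literal.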
3450 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
3451 unsigned Opc = MI.getOpcode();
3452 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
3453 unsigned DescSize = Desc.getSize();
3454
3455 // If we have a definitive size, we can use it. Otherwise we need to inspect
3456 // the operands to know the size.
3457 if (DescSize != 0)
3458 return DescSize;
3459
3460 // 4-byte instructions may have a 32-bit literal encoded after them. Check
3461 // operands that could ever be literals.
3462 if (isVALU(MI) || isSALU(MI)) {
3463 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3464 if (Src0Idx == -1)
3465 return 4; // No operands.
3466
3467 if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
3468 return 8;
3469
3470 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3471 if (Src1Idx == -1)
3472 return 4;
3473
3474 if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
3475 return 8;
3476
3477 return 4;
3478 }
3479
3480 switch (Opc) {
3481 case TargetOpcode::IMPLICIT_DEF:
3482 case TargetOpcode::KILL:
3483 case TargetOpcode::DBG_VALUE:
3484 case TargetOpcode::BUNDLE:
3485 case TargetOpcode::EH_LABEL:
3486 return 0;
3487 case TargetOpcode::INLINEASM: {
3488 const MachineFunction *MF = MI.getParent()->getParent();
3489 const char *AsmStr = MI.getOperand(0).getSymbolName();
3490 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
3491 }
3492 default:
3493 llvm_unreachable("unable to find instruction size");
3494 }
3495 }
3496
3497 ArrayRef<std::pair<int, const char *>>
3498 SIInstrInfo::getSerializableTargetIndices() const {
3499 static const std::pair<int, const char *> TargetIndices[] = {
3500 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
3501 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
3502 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
3503 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
3504 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
3505 return makeArrayRef(TargetIndices);
3506 }
3507
3508 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
3509 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
3510 ScheduleHazardRecognizer *
3511 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
3512 const ScheduleDAG *DAG) const {
3513 return new GCNHazardRecognizer(DAG->MF);
3514 }
3515
3516 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
3517 /// pass.
3518 ScheduleHazardRecognizer *
3519 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
3520 return new GCNHazardRecognizer(MF);
3521 }
3522